2025-12-01

This commit is contained in:
2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,7 @@
"""
Default plugins for rdflib.
This is a namespace package and contains the default plugins for
rdflib.
"""
@@ -0,0 +1,19 @@
from rdflib.namespace import RDF # noqa: N999
from rdflib.term import URIRef
class RDFVOC(RDF):
    # Subclass of the RDF namespace that declares the RDF/XML syntax terms
    # below as URIRef-typed attributes.

    # NOTE(review): both flags are interpreted by the ``RDF`` base class, which
    # is not visible here. Presumably ``_underscore_num`` permits ``_1``/``_2``
    # style members and ``_fail`` makes unknown attribute lookups raise —
    # confirm against rdflib.namespace.
    _underscore_num = True
    _fail = True

    # http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
    # A mapping from unqualified terms to their qualified version.
    RDF: URIRef
    Description: URIRef
    ID: URIRef
    about: URIRef
    parseType: URIRef  # noqa: N815
    resource: URIRef
    li: URIRef
    nodeID: URIRef  # noqa: N815
    datatype: URIRef
@@ -0,0 +1,172 @@
"""
This is a rdflib plugin for parsing Hextuple files, which are Newline-Delimited JSON
(ndjson) files, into a ConjunctiveGraph or Dataset. The store that backs the graph
*must* be able to handle contexts, i.e. multiple graphs.
"""
from __future__ import annotations
import json
import warnings
from io import TextIOWrapper
from typing import TYPE_CHECKING, Any, BinaryIO, List, Optional, TextIO, Union
from rdflib.graph import ConjunctiveGraph, Dataset, Graph
from rdflib.parser import InputSource, Parser
from rdflib.term import BNode, Literal, URIRef
try:
import orjson
_HAS_ORJSON = True
except ImportError:
orjson = None # type: ignore[assignment, unused-ignore]
_HAS_ORJSON = False
if TYPE_CHECKING:
from io import BufferedReader
__all__ = ["HextuplesParser"]
class HextuplesParser(Parser):
    """
    An RDFLib parser for Hextuples

    Every non-blank input line is a JSON array of six strings:
    subject, predicate, value, datatype, language and graph.
    """

    def __init__(self):
        super().__init__()
        # Graph receiving statements whose graph column is empty; set by parse().
        self.default_context: Optional[Graph] = None
        self.skolemize = False

    @staticmethod
    def _strip_bnode_prefix(label: str) -> str:
        """Return ``label`` without a leading ``_:``.

        Only the prefix is removed. A plain ``str.replace("_:", "")`` would also
        delete any later ``_:`` inside the label and thereby change it.
        """
        return label[2:] if label.startswith("_:") else label

    def _parse_hextuple(
        self, ds: Union[Dataset, ConjunctiveGraph], tup: List[Union[str, None]]
    ) -> None:
        """Convert one hextuple into a statement and add it to its graph.

        :param ds: dataset used to look up the named graph given in ``tup[5]``
        :param tup: ``[subject, predicate, value, datatype, language, graph]``
        :raises ValueError: if subject, predicate, value or datatype is None
        :raises Exception: if no graph is named and no default context is set
        """
        # all values check
        # subject, predicate, value, datatype cannot be None
        # language and graph may be None
        if tup[0] is None or tup[1] is None or tup[2] is None or tup[3] is None:
            raise ValueError(
                f"subject, predicate, value, datatype cannot be None. Given: {tup}"
            )

        # 1 - subject
        s: Union[URIRef, BNode]
        if tup[0].startswith("_"):
            s = BNode(value=self._strip_bnode_prefix(tup[0]))
            if self.skolemize:
                s = s.skolemize()
        else:
            s = URIRef(tup[0])

        # 2 - predicate
        p = URIRef(tup[1])

        # 3 - value
        o: Union[URIRef, BNode, Literal]
        if tup[3] == "globalId":
            o = URIRef(tup[2])
        elif tup[3] == "localId":
            o = BNode(value=self._strip_bnode_prefix(tup[2]))
            if self.skolemize:
                o = o.skolemize()
        else:  # literal
            if tup[4] is None:
                o = Literal(tup[2], datatype=URIRef(tup[3]))
            else:
                o = Literal(tup[2], lang=tup[4])

        # 6 - context
        if tup[5] is not None:
            c = (
                BNode(self._strip_bnode_prefix(tup[5]))
                if tup[5].startswith("_:")
                else URIRef(tup[5])
            )
            if isinstance(c, BNode) and self.skolemize:
                c = c.skolemize()

            ds.get_context(c).add((s, p, o))
        elif self.default_context is not None:
            self.default_context.add((s, p, o))
        else:
            raise Exception("No context to parse into!")

    # type error: Signature of "parse" incompatible with supertype "Parser"
    def parse(self, source: InputSource, graph: Graph, skolemize: bool = False, **kwargs: Any) -> None:  # type: ignore[override]
        """Parse a Hextuples document from ``source`` into ``graph``'s store.

        :param source: input providing a character stream and/or a byte stream
        :param graph: target graph; its store must be context-aware
        :param skolemize: when true, every blank node is replaced by the result
            of ``BNode.skolemize()``
        :param kwargs: only ``encoding`` is inspected; anything other than
            ``None``/``"utf-8"`` triggers a warning and is otherwise ignored
        :raises ValueError: if ``source`` offers neither stream, or a line lacks
            a required component
        """
        if kwargs.get("encoding") not in [None, "utf-8"]:
            warnings.warn(
                f"Hextuples files are always utf-8 encoded, "
                f"I was passed: {kwargs.get('encoding')}, "
                "but I'm still going to use utf-8"
            )

        assert (
            graph.store.context_aware
        ), "Hextuples Parser needs a context-aware store!"

        self.skolemize = skolemize
        # Set default_union to True to mimic ConjunctiveGraph behavior
        ds = Dataset(store=graph.store, default_union=True)
        ds_default = ds.default_context  # the DEFAULT_DATASET_GRAPH_ID
        if isinstance(graph, (Dataset, ConjunctiveGraph)):
            self.default_context = graph.default_context
        elif graph.identifier is not None:
            if graph.identifier == ds_default.identifier:
                self.default_context = graph
            else:
                self.default_context = ds.get_context(graph.identifier)
        else:
            # mypy thinks this is unreachable, but graph.identifier can be None
            self.default_context = ds_default  # type: ignore[unreachable]
        if self.default_context is not ds_default:
            ds.default_context = self.default_context
            ds.remove_graph(ds_default)  # remove the original unused default graph

        try:
            text_stream: Optional[TextIO] = source.getCharacterStream()
        except (AttributeError, LookupError):
            text_stream = None
        try:
            binary_stream: Optional[BinaryIO] = source.getByteStream()
        except (AttributeError, LookupError):
            binary_stream = None

        if text_stream is None and binary_stream is None:
            raise ValueError(
                f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
            )
        if TYPE_CHECKING:
            assert text_stream is not None or binary_stream is not None

        # With orjson the byte stream is preferred; with the stdlib json module
        # the text stream is preferred.
        use_stream: Union[TextIO, BinaryIO]
        if _HAS_ORJSON:
            if binary_stream is not None:
                use_stream = binary_stream
            else:
                if TYPE_CHECKING:
                    assert isinstance(text_stream, TextIOWrapper)
                use_stream = text_stream
            loads = orjson.loads
        else:
            if text_stream is not None:
                use_stream = text_stream
            else:
                if TYPE_CHECKING:
                    assert isinstance(binary_stream, BufferedReader)
                use_stream = TextIOWrapper(binary_stream, encoding="utf-8")
            loads = json.loads

        for line in use_stream:  # type: Union[str, bytes]
            if not line or line.isspace():
                # Skipping empty lines because this is what was being done before for the first and last lines, albeit in a rather indirect way.
                # The result is that we accept input that would otherwise be invalid.
                # Possibly we should just let this result in an error.
                continue
            # this complex handling is because the 'value' component is
            # allowed to be "" but not None
            # all other "" values are treated as None
            raw_line: List[str] = loads(line)
            hex_tuple_line = [x if x != "" else None for x in raw_line]
            if raw_line[2] == "":
                hex_tuple_line[2] = ""
            self._parse_hextuple(ds, hex_tuple_line)
@@ -0,0 +1,712 @@
"""
This parser will interpret a JSON-LD document as an RDF Graph. See:
http://json-ld.org/
Example usage::
>>> from rdflib import Graph, URIRef, Literal
>>> test_json = '''
... {
... "@context": {
... "dc": "http://purl.org/dc/terms/",
... "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
... "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
... },
... "@id": "http://example.org/about",
... "dc:title": {
... "@language": "en",
... "@value": "Someone's Homepage"
... }
... }
... '''
>>> g = Graph().parse(data=test_json, format='json-ld')
>>> list(g) == [(URIRef('http://example.org/about'),
... URIRef('http://purl.org/dc/terms/title'),
... Literal("Someone's Homepage", lang='en'))]
True
"""
# From: https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/parser.py
# NOTE: This code reads the entire JSON object into memory before parsing, but
# we should consider streaming the input to deal with arbitrarily large graphs.
from __future__ import annotations
import secrets
import warnings
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
import rdflib.parser
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.namespace import RDF, XSD
from rdflib.parser import InputSource, URLInputSource
from rdflib.term import BNode, IdentifiedNode, Literal, Node, URIRef
from ..shared.jsonld.context import UNDEF, Context, Term
from ..shared.jsonld.keys import (
CONTEXT,
GRAPH,
ID,
INCLUDED,
INDEX,
JSON,
LANG,
LIST,
NEST,
NONE,
REV,
SET,
TYPE,
VALUE,
VOCAB,
)
from ..shared.jsonld.util import (
_HAS_ORJSON,
VOCAB_DELIMS,
context_from_urlinputsource,
json,
orjson,
source_to_json,
)
__all__ = ["JsonLDParser", "to_rdf"]
TYPE_TERM = Term(str(RDF.type), TYPE, VOCAB) # type: ignore[call-arg]
ALLOW_LISTS_OF_LISTS = True # NOTE: Not allowed in JSON-LD 1.0
class JsonLDParser(rdflib.parser.Parser):
    """RDFLib parser plugin that reads JSON-LD and hands it to :func:`to_rdf`."""

    def __init__(self):
        super().__init__()

    def parse(
        self,
        source: InputSource,
        sink: Graph,
        version: float = 1.1,
        skolemize: bool = False,
        encoding: Optional[str] = "utf-8",
        base: Optional[str] = None,
        context: Optional[
            Union[
                List[Union[Dict[str, Any], str, None]],
                Dict[str, Any],
                str,
            ]
        ] = None,
        generalized_rdf: Optional[bool] = False,
        extract_all_scripts: Optional[bool] = False,
        **kwargs: Any,
    ) -> None:
        """Parse JSON-LD from a source document.

        The source document can be JSON or HTML with embedded JSON script
        elements (type attribute = "application/ld+json"). To process as HTML
        ``source.content_type`` must be set to "text/html" or
        "application/xhtml+xml".

        :param source: InputSource with JSON-formatted data (JSON or HTML)
        :param sink: Graph to receive the parsed triples
        :param version: parse as JSON-LD version, defaults to 1.1; a value that
            cannot be converted with ``float()`` also falls back to 1.1
        :param skolemize: forwarded to :func:`to_rdf`; when true, blank nodes
            produced while parsing are replaced by ``BNode.skolemize()``
        :param encoding: character encoding of the JSON (should be "utf-8"
            or "utf-16"), defaults to "utf-8"
        :param base: JSON-LD `Base IRI <https://www.w3.org/TR/json-ld/#base-iri>`_, defaults to None
        :param context: JSON-LD `Context <https://www.w3.org/TR/json-ld/#the-context>`_, defaults to None
        :param generalized_rdf: parse as `Generalized RDF <https://www.w3.org/TR/json-ld/#relationship-to-rdf>`_, defaults to False
        :param extract_all_scripts: if source is an HTML document then extract
            all script elements, defaults to False (extract only the first
            script element). This is ignored if ``source.system_id`` contains
            a fragment identifier, in which case only the script element with
            matching id attribute is extracted.
        """
        if encoding not in ("utf-8", "utf-16"):
            warnings.warn(
                "JSON should be encoded as unicode. "
                "Given encoding was: %s" % encoding
            )

        if not base:
            base = sink.absolutize(source.getPublicId() or source.getSystemId() or "")

        context_data = context
        if not context_data and hasattr(source, "url") and hasattr(source, "links"):
            if TYPE_CHECKING:
                assert isinstance(source, URLInputSource)
            context_data = context_from_urlinputsource(source)

        try:
            version = float(version)
        except (TypeError, ValueError):
            # float(None) raises TypeError, float("abc") raises ValueError;
            # both mean "no usable version", so use the default.
            version = 1.1

        # Get the optional fragment identifier
        try:
            fragment_id = URIRef(source.getSystemId()).fragment
        except Exception:
            fragment_id = None

        data, html_base = source_to_json(source, fragment_id, extract_all_scripts)
        if html_base is not None:
            base = URIRef(html_base, base=base)

        # NOTE: A ConjunctiveGraph parses into a Graph sink, so no sink will be
        # context_aware. Keeping this check in case RDFLib is changed, or
        # someone passes something context_aware to this parser directly.
        conj_sink: Graph
        if not sink.context_aware:
            conj_sink = ConjunctiveGraph(store=sink.store, identifier=sink.identifier)
        else:
            conj_sink = sink

        to_rdf(
            data,
            conj_sink,
            base,
            context_data,
            version,
            bool(generalized_rdf),
            skolemize=skolemize,
        )
def to_rdf(
    data: Any,
    dataset: Graph,
    base: Optional[str] = None,
    context_data: Optional[
        Union[
            List[Union[Dict[str, Any], str, None]],
            Dict[str, Any],
            str,
        ]
    ] = None,
    version: Optional[float] = None,
    generalized_rdf: bool = False,
    allow_lists_of_lists: Optional[bool] = None,
    skolemize: bool = False,
):
    """Convert already-decoded JSON-LD ``data`` into triples in ``dataset``.

    :param data: the decoded JSON-LD document
    :param dataset: graph that receives the triples
    :param base: base IRI used to create the initial context
    :param context_data: optional context loaded into the initial context
        before ``data`` is processed
    :param version: JSON-LD version used to create the initial context
    :param generalized_rdf: forwarded to :class:`Parser`
    :param allow_lists_of_lists: forwarded to :class:`Parser`
    :param skolemize: forwarded to :class:`Parser`
    :return: whatever ``Parser.parse`` returns for ``dataset``
    """
    active_context = Context(base=base, version=version)
    if context_data:
        active_context.load(context_data)

    return Parser(
        generalized_rdf=generalized_rdf,
        allow_lists_of_lists=allow_lists_of_lists,
        skolemize=skolemize,
    ).parse(data, active_context, dataset)
class Parser:
def __init__(
    self,
    generalized_rdf: bool = False,
    allow_lists_of_lists: Optional[bool] = None,
    skolemize: bool = False,
):
    """Store the parsing options.

    :param generalized_rdf: kept as ``self.generalized_rdf``
    :param allow_lists_of_lists: ``None`` selects the module default
        ``ALLOW_LISTS_OF_LISTS``
    :param skolemize: kept as ``self.skolemize``
    """
    if allow_lists_of_lists is None:
        allow_lists_of_lists = ALLOW_LISTS_OF_LISTS

    self.skolemize = skolemize
    self.generalized_rdf = generalized_rdf
    self.allow_lists_of_lists = allow_lists_of_lists
    # Maps an id value whose URIRef came out empty to the BNode standing in for it.
    self.invalid_uri_to_bnode: dict[str, BNode] = {}
def parse(self, data: Any, context: Context, dataset: Graph) -> Graph:
    """Add every top-level node of ``data`` to ``dataset``.

    :param data: decoded JSON-LD; a list of nodes or a single node dict.
        Any other top-level value yields no triples instead of failing with
        an ``UnboundLocalError``.
    :param context: active context; a top-level ``@context`` is loaded into it
    :param dataset: target graph; its ``default_context`` is used when it is
        context-aware
    :return: the graph the top-level nodes were added to
    """
    topcontext = False
    resources: List[Any]
    if isinstance(data, list):
        resources = data
    else:
        if isinstance(data, dict):
            local_context = data.get(CONTEXT)
            if local_context:
                context.load(local_context, context.base)
                topcontext = True
        # A lone value is processed as a one-element list. Non-dict entries
        # are skipped by _add_to_graph, so scalars simply produce nothing.
        resources = [data]

    if context.vocab:
        dataset.bind(None, context.vocab)
    for name, term in context.terms.items():
        if term.id and term.id.endswith(VOCAB_DELIMS):
            dataset.bind(name, term.id)

    # type error: "Graph" has no attribute "default_context"
    graph = dataset.default_context if dataset.context_aware else dataset  # type: ignore[attr-defined]

    for node in resources:
        self._add_to_graph(dataset, graph, context, node, topcontext)

    return graph
def _add_to_graph(
    self,
    dataset: Graph,
    graph: Graph,
    context: Context,
    node: Any,
    topcontext: bool = False,
) -> Optional[Node]:
    """Add one node object to ``graph`` and return its subject.

    Returns ``None`` without adding anything when ``node`` is not a dict,
    when the context reports a value for it, or when its id cannot be turned
    into an RDF identifier.
    """
    if not isinstance(node, dict) or context.get_value(node):
        # type error: Return value expected
        return  # type: ignore[return-value]

    # A node-local @context applies unless it was already loaded as the
    # document's top-level context.
    if CONTEXT in node and not topcontext:
        node_context = node[CONTEXT]
        if node_context:
            context = context.subcontext(node_context)
        else:
            context = Context(base=context.doc_base)

    # type error: Incompatible types in assignment (expression has type "Optional[Context]", variable has type "Context")
    context = context.get_context_for_type(node)  # type: ignore[assignment]

    id_val = context.get_id(node)
    if id_val is None:
        nested_id = self._get_nested_id(context, node)
        if nested_id is not None and len(nested_id) > 0:
            id_val = nested_id

    if isinstance(id_val, str):
        subj = self._to_rdf_id(context, id_val)
        if subj is None:
            return None
    else:
        subj = BNode()
        if self.skolemize:
            subj = subj.skolemize()

    # NOTE: crude way to signify that this node might represent a named graph
    no_id = id_val is None

    for key, obj in node.items():
        if key == CONTEXT or key in context.get_keys(ID):
            continue

        if not (key == REV or key in context.get_keys(REV)):
            self._key_to_graph(dataset, graph, context, subj, key, obj, no_id=no_id)
            continue

        # Reverse map: each inner entry is emitted with the direction flipped.
        for rkey, robj in obj.items():
            self._key_to_graph(
                dataset,
                graph,
                context,
                subj,
                rkey,
                robj,
                reverse=True,
                no_id=no_id,
            )

    return subj
# type error: Missing return statement
def _get_nested_id(self, context: Context, node: Dict[str, Any]) -> Optional[str]: # type: ignore[return]
for key, obj in node.items():
if context.version >= 1.1 and key in context.get_keys(NEST):
term = context.terms.get(key)
if term and term.id is None:
continue
objs = obj if isinstance(obj, list) else [obj]
for obj in objs:
if not isinstance(obj, dict):
continue
id_val = context.get_id(obj)
if not id_val:
subcontext = context.get_context_for_term(
context.terms.get(key)
)
id_val = self._get_nested_id(subcontext, obj)
if isinstance(id_val, str):
return id_val
def _key_to_graph(
    self,
    dataset: Graph,
    graph: Graph,
    context: Context,
    subj: Node,
    key: str,
    obj: Any,
    reverse: bool = False,
    no_id: bool = False,
) -> None:
    """Add the triples produced by one ``key``/``obj`` entry of a node.

    :param dataset: graph used to look up a named context for ``GRAPH`` entries
    :param graph: graph that receives the triples
    :param context: active context used to resolve ``key`` and the values
    :param subj: subject the entry belongs to
    :param key: the entry's key as it appears in the node
    :param obj: the entry's value, either a single value or a list
    :param reverse: when true, triples are added as ``(object, pred, subj)``
    :param no_id: true when the enclosing node had no id; ``GRAPH`` content
        then goes into ``graph`` instead of a context named after ``subj``
    """
    # Normalise the value to a list of object nodes.
    if isinstance(obj, list):
        obj_nodes = obj
    else:
        obj_nodes = [obj]

    # A term definition may reshape the value: JSON-typed terms become one
    # typed JSON value, list containers are wrapped, dict values are expanded
    # according to the term's container.
    term = context.terms.get(key)
    if term:
        term_id = term.id
        if term.type == JSON:
            obj_nodes = [self._to_typed_json_value(obj)]
        elif LIST in term.container:
            obj_nodes = [self._expand_nested_list(obj_nodes)]
        elif isinstance(obj, dict):
            obj_nodes = self._parse_container(context, term, obj)
    else:
        term_id = None

    if TYPE in (key, term_id):
        term = TYPE_TERM

    # Keyword entries below add their content as nodes and stop; they do not
    # produce a predicate triple for ``subj``.
    if GRAPH in (key, term_id):
        if dataset.context_aware and not no_id:
            if TYPE_CHECKING:
                assert isinstance(dataset, ConjunctiveGraph)
            # type error: Argument 1 to "get_context" of "ConjunctiveGraph" has incompatible type "Node"; expected "Union[IdentifiedNode, str, None]"
            subgraph = dataset.get_context(subj)  # type: ignore[arg-type]
        else:
            subgraph = graph
        for onode in obj_nodes:
            self._add_to_graph(dataset, subgraph, context, onode)
        return

    if SET in (key, term_id):
        for onode in obj_nodes:
            self._add_to_graph(dataset, graph, context, onode)
        return

    if INCLUDED in (key, term_id):
        for onode in obj_nodes:
            self._add_to_graph(dataset, graph, context, onode)
        return

    # Nested properties: each inner entry is processed as if it were written
    # directly on ``subj``.
    if context.version >= 1.1 and key in context.get_keys(NEST):
        term = context.terms.get(key)
        if term and term.id is None:
            return
        objs = obj if isinstance(obj, list) else [obj]
        for obj in objs:
            if not isinstance(obj, dict):
                continue
            for nkey, nobj in obj.items():
                # NOTE: we've already captured subject
                if nkey in context.get_keys(ID):
                    continue
                subcontext = context.get_context_for_type(obj)
                # type error: Argument 3 to "_key_to_graph" of "Parser" has incompatible type "Optional[Context]"; expected "Context"
                self._key_to_graph(dataset, graph, subcontext, subj, nkey, nobj)  # type: ignore[arg-type]
        return

    pred_uri = term.id if term else context.expand(key)

    # From here on ``context`` is the term-scoped context; ``flatten`` below
    # closes over this rebound name.
    context = context.get_context_for_term(term)

    # Flatten deep nested lists
    def flatten(n: Iterable[Any]) -> List[Any]:
        flattened = []
        for obj in n:
            if isinstance(obj, dict):
                objs = context.get_set(obj)
                if objs is not None:
                    obj = objs
            if isinstance(obj, list):
                flattened += flatten(obj)
                continue
            flattened.append(obj)
        return flattened

    obj_nodes = flatten(obj_nodes)

    if not pred_uri:
        return

    # A reverse term flips whatever direction the caller asked for.
    if term and term.reverse:
        reverse = not reverse

    pred: IdentifiedNode
    bid = self._get_bnodeid(pred_uri)
    if bid:
        # Blank-node predicates are only emitted for generalized RDF.
        if not self.generalized_rdf:
            return
        pred = BNode(bid)
        if self.skolemize:
            pred = pred.skolemize()
    else:
        pred = URIRef(pred_uri)

    for obj_node in obj_nodes:
        obj = self._to_object(dataset, graph, context, term, obj_node)
        if obj is None:
            continue
        if reverse:
            graph.add((obj, pred, subj))
        else:
            graph.add((subj, pred, obj))
def _parse_container(
    self, context: Context, term: Term, obj: Dict[str, Any]
) -> List[Any]:
    """Expand a dict value according to ``term.container`` into object nodes.

    :param context: active context (supplies the version and keyword aliases)
    :param term: term definition whose ``container`` selects the expansion
    :param obj: the map found as the term's value
    :return: list of nodes; ``[obj]`` when no container rule applies
    """
    # Language map: values become ``(value, lang)`` tuples, except under a
    # key the context lists for NONE, where values are kept as they are.
    if LANG in term.container:
        obj_nodes = []
        for lang, values in obj.items():
            if not isinstance(values, list):
                values = [values]
            if lang in context.get_keys(NONE):
                obj_nodes += values
            else:
                for v in values:
                    obj_nodes.append((v, lang))
        return obj_nodes

    # Every branch except the final INDEX one requires version >= 1.1.
    v11 = context.version >= 1.1

    if v11 and GRAPH in term.container and ID in term.container:
        # Each key names a graph, unless it is a NONE key.
        return [
            (
                dict({GRAPH: o})
                if k in context.get_keys(NONE)
                else dict({ID: k, GRAPH: o}) if isinstance(o, dict) else o
            )
            for k, o in obj.items()
        ]

    elif v11 and GRAPH in term.container and INDEX in term.container:
        # The index key itself is dropped; only the graph content is kept.
        return [dict({GRAPH: o}) for k, o in obj.items()]

    elif v11 and GRAPH in term.container:
        return [dict({GRAPH: obj})]

    elif v11 and ID in term.container:
        # Each key becomes the id of its dict value (copied, not mutated).
        return [
            (
                dict({ID: k}, **o)
                if isinstance(o, dict) and k not in context.get_keys(NONE)
                else o
            )
            for k, o in obj.items()
        ]

    elif v11 and TYPE in term.container:
        # Each key is appended to the types of its value; a string value is
        # first turned into an id reference.
        return [
            (
                self._add_type(
                    context,
                    (
                        {ID: context.expand(o) if term.type == VOCAB else o}
                        if isinstance(o, str)
                        else o
                    ),
                    k,
                )
                if isinstance(o, (dict, str)) and k not in context.get_keys(NONE)
                else o
            )
            for k, o in obj.items()
        ]

    elif INDEX in term.container:
        obj_nodes = []
        for key, nodes in obj.items():
            if not isinstance(nodes, list):
                nodes = [nodes]
            for node in nodes:
                # With a property-valued index, the key is appended to that
                # property on the node (the node dict is modified in place).
                if v11 and term.index and key not in context.get_keys(NONE):
                    if not isinstance(node, dict):
                        node = {ID: node}
                    values = node.get(term.index, [])
                    if not isinstance(values, list):
                        values = [values]
                    values.append(key)
                    node[term.index] = values
                obj_nodes.append(node)
        return obj_nodes

    return [obj]
@staticmethod
def _add_type(context: Context, o: Dict[str, Any], k: str) -> Dict[str, Any]:
    """Append ``k`` to the types of node ``o`` and return ``o``.

    ``o`` is modified in place; an existing type list is extended in place.
    """
    current = context.get_type(o)
    if not current:
        type_list = []
    elif isinstance(current, list):
        type_list = current
    else:
        type_list = [current]

    type_list.append(k)
    o[TYPE] = type_list
    return o
def _to_object(
    self,
    dataset: Graph,
    graph: Graph,
    context: Context,
    term: Optional[Term],
    node: Any,
    inlist: bool = False,
) -> Optional[Node]:
    """Convert one object node into an RDF term, adding triples as needed.

    :param dataset: passed on when nested nodes or lists must be added
    :param graph: graph receiving triples for nested nodes or lists
    :param context: active context used to read keywords from ``node``
    :param term: term definition of the property, if any
    :param node: a ``(value, lang)`` tuple, a dict, or a plain value
    :param inlist: true when ``node`` is itself a list member
    :return: the resulting term, or ``None`` when nothing should be emitted
    """
    # ``(value, lang)`` tuples are produced by language-map expansion.
    if isinstance(node, tuple):
        value, lang = node
        if value is None:
            # type error: Return value expected
            return  # type: ignore[return-value]
        if lang and " " in lang:
            # type error: Return value expected
            return  # type: ignore[return-value]
        return Literal(value, lang=lang)

    if isinstance(node, dict):
        node_list = context.get_list(node)
        if node_list is not None:
            if inlist and not self.allow_lists_of_lists:
                # type error: Return value expected
                return  # type: ignore[return-value]
            listref = self._add_list(dataset, graph, context, term, node_list)
            if listref:
                return listref
        # A dict without a list falls through to the value/node handling below.

    else:  # expand compacted value
        if term and term.type:
            # Typed term: wrap the plain value in a dict so the shared
            # handling at the bottom can process it.
            if term.type == JSON:
                node = self._to_typed_json_value(node)
            elif node is None:
                # type error: Return value expected
                return  # type: ignore[return-value]
            elif term.type == ID and isinstance(node, str):
                node = {ID: context.resolve(node)}
            elif term.type == VOCAB and isinstance(node, str):
                node = {ID: context.expand(node) or context.resolve_iri(node)}
            else:
                node = {TYPE: term.type, VALUE: node}
        else:
            # Untyped plain value: emitted directly as a literal.
            if node is None:
                # type error: Return value expected
                return  # type: ignore[return-value]
            if isinstance(node, float):
                return Literal(node, datatype=XSD.double)

            # The term's language wins over the context's unless it is UNDEF.
            if term and term.language is not UNDEF:
                lang = term.language
            else:
                lang = context.language
            return Literal(node, lang=lang)

    lang = context.get_language(node)
    # The datatype is only looked up when no language is present.
    datatype = not lang and context.get_type(node) or None
    value = context.get_value(node)

    # type error: Unsupported operand types for in ("Optional[Any]" and "Generator[str, None, None]")
    if datatype in context.get_keys(JSON):  # type: ignore[operator]
        node = self._to_typed_json_value(value)
        datatype = context.get_type(node)
        value = context.get_value(node)

    if lang or context.get_key(VALUE) in node or VALUE in node:
        # Value object: becomes a literal.
        if value is None:
            return None
        if lang:
            if " " in lang:
                # type error: Return value expected
                return  # type: ignore[return-value]
            return Literal(value, lang=lang)
        elif datatype:
            return Literal(value, datatype=context.expand(datatype))
        else:
            return Literal(value)
    else:
        # Node object: added to the graph; its subject is the result.
        return self._add_to_graph(dataset, graph, context, node)
def _to_rdf_id(self, context: Context, id_val: str) -> Optional[IdentifiedNode]:
    """Turn an id string into a blank node or URI reference.

    Returns ``None`` when the resolved id contains no ``:`` and generalized
    RDF is off. An id whose ``URIRef`` is empty is replaced by a blank node
    that is reused for the same ``id_val`` on later calls.
    """
    label = self._get_bnodeid(id_val)
    if label:
        bnode = BNode(label)
        return bnode.skolemize() if self.skolemize else bnode

    resolved = context.resolve(id_val)
    if not self.generalized_rdf and ":" not in resolved:
        return None

    uri_node: IdentifiedNode = URIRef(resolved)
    if str(uri_node):
        return uri_node

    replacements = self.invalid_uri_to_bnode
    if id_val not in replacements:
        replacements[id_val] = BNode(secrets.token_urlsafe(20))
    return replacements[id_val]
def _get_bnodeid(self, ref: str) -> Optional[str]:
if not ref.startswith("_:"):
# type error: Return value expected
return # type: ignore[return-value]
bid = ref.split("_:", 1)[-1]
return bid or None
def _add_list(
    self,
    dataset: Graph,
    graph: Graph,
    context: Context,
    term: Optional[Term],
    node_list: Any,
) -> IdentifiedNode:
    """Add ``node_list`` to ``graph`` as an RDF collection and return its head.

    Members that are ``None`` or that convert to no RDF term are left out.
    A list cell is only created once its member is known to convert, so a
    skipped member in the middle can no longer leave behind a cell that has
    no ``rdf:first`` and points to itself via ``rdf:rest``.

    :param dataset: passed on to ``_to_object`` for nested nodes
    :param graph: graph that receives the list triples
    :param context: active context
    :param term: term definition of the property holding the list, if any
    :param node_list: the members; a non-list value is treated as one member
    :return: the first list cell, or ``rdf:nil`` when no member was added
    """
    if not isinstance(node_list, list):
        node_list = [node_list]

    head: Union[URIRef, BNode, None] = None
    tail: Union[URIRef, BNode, None] = None

    for node in node_list:
        if node is None:
            continue

        # Convert first: only members that yield a term get a cell.
        obj = self._to_object(dataset, graph, context, term, node, inlist=True)
        if obj is None:
            continue

        cell: Union[URIRef, BNode] = BNode()
        if self.skolemize and isinstance(cell, BNode):
            cell = cell.skolemize()

        if tail is None:
            head = cell
        else:
            graph.add((tail, RDF.rest, cell))
        graph.add((cell, RDF.first, obj))
        tail = cell

    if head is None or tail is None:
        return RDF.nil

    graph.add((tail, RDF.rest, RDF.nil))
    return head
@staticmethod
def _to_typed_json_value(value: Any) -> Dict[str, str]:
    """Wrap ``value`` as a value object typed ``RDF.JSON``.

    The value is serialised with sorted keys, using orjson when it is
    available and the stdlib ``json`` module otherwise.
    """
    if _HAS_ORJSON:
        options = orjson.OPT_SORT_KEYS | orjson.OPT_NON_STR_KEYS
        serialized: str = orjson.dumps(value, option=options).decode("utf-8")
    else:
        serialized = json.dumps(
            value, separators=(",", ":"), sort_keys=True, ensure_ascii=False
        )

    typed_value = {TYPE: RDF.JSON, VALUE: serialized}
    return typed_value
@classmethod
def _expand_nested_list(cls, obj_nodes: List[Any]) -> Dict[str, List[Any]]:
    """Wrap ``obj_nodes`` as a list object, recursing into nested lists."""
    members = []
    for member in obj_nodes:
        if isinstance(member, list):
            member = cls._expand_nested_list(member)
        members.append(member)
    return {LIST: members}
@@ -0,0 +1,133 @@
"""
This is a rdflib plugin for parsing NQuad files into Conjunctive
graphs that can be used and queried. The store that backs the graph
*must* be able to handle contexts.
>>> from rdflib import ConjunctiveGraph, URIRef, Namespace
>>> g = ConjunctiveGraph()
>>> data = open("test/data/nquads.rdflib/example.nquads", "rb")
>>> g.parse(data, format="nquads") # doctest:+ELLIPSIS
<Graph identifier=... (<class 'rdflib.graph.Graph'>)>
>>> assert len(g.store) == 449
>>> # There should be 16 separate contexts
>>> assert len([x for x in g.store.contexts()]) == 16
>>> # is the name of entity E10009 "Arco Publications"?
>>> # (in graph http://bibliographica.org/entity/E10009)
>>> # Looking for:
>>> # <http://bibliographica.org/entity/E10009>
>>> # <http://xmlns.com/foaf/0.1/name>
>>> # "Arco Publications"
>>> # <http://bibliographica.org/entity/E10009>
>>> s = URIRef("http://bibliographica.org/entity/E10009")
>>> FOAF = Namespace("http://xmlns.com/foaf/0.1/")
>>> assert(g.value(s, FOAF.name).eq("Arco Publications"))
"""
from __future__ import annotations
from codecs import getreader
from typing import Any, MutableMapping, Optional
from rdflib.exceptions import ParserError as ParseError
from rdflib.graph import ConjunctiveGraph, Dataset, Graph
from rdflib.parser import InputSource
# Build up from the NTriples parser:
from rdflib.plugins.parsers.ntriples import W3CNTriplesParser, r_tail, r_wspace
from rdflib.term import BNode
__all__ = ["NQuadsParser"]
_BNodeContextType = MutableMapping[str, BNode]
class NQuadsParser(W3CNTriplesParser):
    """N-Quads parser built on the N-Triples parser; adds the graph column."""

    # type error: Signature of "parse" incompatible with supertype "W3CNTriplesParser"
    def parse(  # type: ignore[override]
        self,
        inputsource: InputSource,
        sink: Graph,
        bnode_context: Optional[_BNodeContextType] = None,
        skolemize: bool = False,
        **kwargs: Any,
    ):
        """
        Parse inputsource as an N-Quads file.

        :type inputsource: `rdflib.parser.InputSource`
        :param inputsource: the source of N-Quads-formatted data
        :type sink: `rdflib.graph.Graph`
        :param sink: where to send parsed triples
        :type bnode_context: `dict`, optional
        :param bnode_context: a dict mapping blank node identifiers to `~rdflib.term.BNode` instances.
                              See `.W3CNTriplesParser.parse`
        :type skolemize: `bool`
        :param skolemize: stored as ``self.skolemize`` for the duration of the
                          parse (presumably read by the inherited term
                          parsers — they are not part of this class)
        :return: the dataset the statements were added to
        :raises ParseError: if the input offers no file-like stream or a line
                            is not valid N-Quads
        """
        assert (
            sink.store.context_aware
        ), "NQuadsParser must be given a context-aware store."
        # Set default_union to True to mimic ConjunctiveGraph behavior
        ds = Dataset(store=sink.store, default_union=True)
        ds_default = ds.default_context  # the DEFAULT_DATASET_GRAPH_ID
        new_default_context = None
        if isinstance(sink, (Dataset, ConjunctiveGraph)):
            new_default_context = sink.default_context
        elif sink.identifier is not None:
            if sink.identifier == ds_default.identifier:
                new_default_context = sink
            else:
                new_default_context = ds.get_context(sink.identifier)

        if new_default_context is not None:
            ds.default_context = new_default_context
            ds.remove_graph(ds_default)  # remove the original unused default graph
        # type error: Incompatible types in assignment (expression has type "ConjunctiveGraph", base class "W3CNTriplesParser" defined the type as "Union[DummySink, NTGraphSink]")
        self.sink: Dataset = ds  # type: ignore[assignment]
        self.skolemize = skolemize

        source = inputsource.getCharacterStream()
        if not source:
            byte_source = inputsource.getByteStream()
            # Check before wrapping: the StreamReader wrapper always has a
            # ``read`` attribute, so checking afterwards can never fail.
            if not hasattr(byte_source, "read"):
                raise ParseError("Item to parse must be a file-like object.")
            source = getreader("utf-8")(byte_source)
        elif not hasattr(source, "read"):
            raise ParseError("Item to parse must be a file-like object.")

        self.file = source
        self.buffer = ""
        while True:
            self.line = current_line = self.readline()
            if self.line is None:
                break
            try:
                self.parseline(bnode_context)
            except ParseError as msg:
                raise ParseError(
                    "Invalid line (%s):\n%r" % (msg, current_line)
                ) from msg

        return self.sink

    def parseline(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
        """Parse ``self.line`` as one statement and add it to the dataset.

        Blank lines and ``#`` comment lines are ignored. A statement without
        a graph term goes into the dataset's default context.

        :raises ParseError: if text remains after the statement terminator
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith("#"):
            return  # The line is empty or a comment

        subject = self.subject(bnode_context)
        self.eat(r_wspace)

        predicate = self.predicate()
        self.eat(r_wspace)

        obj = self.object(bnode_context)
        self.eat(r_wspace)

        context = self.uriref() or self.nodeid(bnode_context)
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")
        # Must have a context aware store - add on a normal Graph
        # discards anything where the ctx != graph.identifier
        if context:
            self.sink.get_context(context).add((subject, predicate, obj))
        else:
            self.sink.default_context.add((subject, predicate, obj))
@@ -0,0 +1,385 @@
"""\
N-Triples Parser
License: GPL 2, W3C, BSD, or MIT
Author: Sean B. Palmer, inamidst.com
"""
from __future__ import annotations
import codecs
import re
from io import BytesIO, StringIO, TextIOBase
from typing import (
IO,
TYPE_CHECKING,
Any,
Match,
MutableMapping,
Optional,
Pattern,
TextIO,
Union,
)
from rdflib.compat import _string_escape_map, decodeUnicodeEscape
from rdflib.exceptions import ParserError as ParseError
from rdflib.parser import InputSource, Parser
from rdflib.term import BNode as bNode
from rdflib.term import Literal, URIRef
from rdflib.term import URIRef as URI # noqa: N814
if TYPE_CHECKING:
import typing_extensions as te
from rdflib.graph import Graph, _ObjectType, _PredicateType, _SubjectType
__all__ = [
"unquote",
"uriquote",
"W3CNTriplesParser",
"NTGraphSink",
"NTParser",
"DummySink",
]
uriref = r'<([^:]+:[^\s"<>]*)>'
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
litinfo = r"(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)|\^\^" + uriref + r")?"
r_line = re.compile(r"([^\r\n]*)(?:\r\n|\r|\n)")
r_wspace = re.compile(r"[ \t]*")
r_wspaces = re.compile(r"[ \t]+")
r_tail = re.compile(r"[ \t]*\.[ \t]*(#.*)?")
r_uriref = re.compile(uriref)
r_nodeid = re.compile(r"_:([A-Za-z0-9_:]([-A-Za-z0-9_:\.]*[-A-Za-z0-9_:])?)")
r_literal = re.compile(literal + litinfo)
bufsiz = 2048
validate = False
class DummySink:
    """Sink that counts the triples it receives and prints each one."""

    def __init__(self):
        # Number of triples received so far.
        self.length = 0

    def triple(self, s, p, o):
        """Count the triple and print its three terms."""
        self.length = self.length + 1
        print(s, p, o)
r_safe = re.compile(r"([\x20\x21\x23-\x5B\x5D-\x7E]+)")
r_quot = re.compile(r"""\\([tbnrf"'\\])""")
r_uniquot = re.compile(r"\\u([0-9A-Fa-f]{4})|\\U([0-9A-Fa-f]{8})")
def unquote(s: str) -> str:
    """Unquote an N-Triples string.

    With the module flag ``validate`` off, decoding is delegated to
    ``decodeUnicodeEscape`` (or ``bytes.decode("unicode-escape")`` for
    non-str input). With it on, the string is scanned piece by piece and a
    ``ParseError`` is raised for anything that is not a safe run, a
    single-character escape, or a ``\\u``/``\\U`` escape.
    """
    if not validate:
        if isinstance(s, str):  # nquads
            s = decodeUnicodeEscape(s)
        else:
            s = s.decode("unicode-escape")  # type: ignore[unreachable]
        return s

    pieces = []
    while s:
        safe_run = r_safe.match(s)
        if safe_run:
            pieces.append(safe_run.group(1))
            s = s[safe_run.end() :]
            continue

        char_escape = r_quot.match(s)
        if char_escape:
            pieces.append(_string_escape_map[char_escape.group(1)])
            # Backslash plus one character.
            s = s[2:]
            continue

        unicode_escape = r_uniquot.match(s)
        if unicode_escape:
            s = s[unicode_escape.end() :]
            short_hex, long_hex = unicode_escape.groups()
            codepoint = int(short_hex or long_hex, 16)
            if codepoint > 0x10FFFF:
                raise ParseError("Disallowed codepoint: %08X" % codepoint)
            pieces.append(chr(codepoint))
            continue

        if s.startswith("\\"):
            raise ParseError("Illegal escape at: %s..." % s[:10])
        raise ParseError("Illegal literal character: %r" % s[0])

    return "".join(pieces)
r_hibyte = re.compile(r"([\x80-\xFF])")
def uriquote(uri: str) -> str:
    """Percent-encode characters matched by ``r_hibyte`` when validating.

    With the module flag ``validate`` off, ``uri`` is returned unchanged.
    """
    if validate:

        def _percent_encode(match):
            return "%%%02X" % ord(match.group(1))

        return r_hibyte.sub(_percent_encode, uri)
    return uri
_BNodeContextType = MutableMapping[str, bNode]
class W3CNTriplesParser:
"""An N-Triples Parser.
This is a legacy-style Triples parser for NTriples provided by W3C
Usage::
p = W3CNTriplesParser(sink=MySink())
sink = p.parse(f) # file; use parsestring for a string
To define a context in which blank node identifiers refer to the same blank node
across instances of NTriplesParser, pass the same dict as ``bnode_context`` to each
instance. By default, a new blank node context is created for each instance of
`W3CNTriplesParser`.
"""
__slots__ = ("_bnode_ids", "sink", "buffer", "file", "line", "skolemize")
def __init__(
    self,
    sink: Optional[Union[DummySink, NTGraphSink]] = None,
    bnode_context: Optional[_BNodeContextType] = None,
):
    """Set up the parser state.

    :param sink: receiver of parsed triples; a new ``DummySink`` when omitted
    :param bnode_context: mapping stored as ``self._bnode_ids``; a new empty
        dict when omitted
    """
    self.skolemize = False
    self._bnode_ids = {} if bnode_context is None else bnode_context
    self.sink: Union[DummySink, NTGraphSink] = (
        DummySink() if sink is None else sink
    )

    # Reading state, filled in by parse().
    self.buffer: Optional[str] = None
    self.file: Optional[Union[TextIO, codecs.StreamReader]] = None
    self.line: Optional[str] = ""
def parse(
    self,
    f: Union[TextIO, IO[bytes], codecs.StreamReader],
    bnode_context: Optional[_BNodeContextType] = None,
    skolemize: bool = False,
) -> Union[DummySink, NTGraphSink]:
    """
    Parse f as an N-Triples file.

    :type f: :term:`file object`
    :param f: the N-Triples source
    :type bnode_context: `dict`, optional
    :param bnode_context: a dict mapping blank node identifiers (e.g., ``a`` in ``_:a``)
                          to `~rdflib.term.BNode` instances. An empty dict can be
                          passed in to define a distinct context for a given call to
                          `parse`.
    :type skolemize: `bool`
    :param skolemize: stored as ``self.skolemize`` before any line is parsed
    :return: the sink the triples were sent to
    :raises ParseError: if ``f`` has no ``read`` attribute or a line is invalid;
                        for an invalid line the underlying error is attached
                        as ``__cause__``
    """
    if not hasattr(f, "read"):
        raise ParseError("Item to parse must be a file-like object.")

    if not hasattr(f, "encoding") and not hasattr(f, "charbuffer"):
        # someone still using a bytestream here?
        f = codecs.getreader("utf-8")(f)

    self.skolemize = skolemize
    self.file = f  # type: ignore[assignment]
    self.buffer = ""
    while True:
        self.line = self.readline()
        if self.line is None:
            break
        try:
            self.parseline(bnode_context=bnode_context)
        except ParseError as err:
            # Chain explicitly so the specific reason stays attached to the
            # more general "Invalid line" error.
            raise ParseError("Invalid line: {}".format(self.line)) from err
    return self.sink
def parsestring(self, s: Union[bytes, bytearray, str], **kwargs) -> None:
    """Parse the N-Triples document held in ``s``.

    Text is wrapped in a ``StringIO``; bytes-like input is decoded as
    UTF-8. Extra keyword arguments are forwarded to ``parse``.

    :raises ParseError: if ``s`` is neither ``str``, ``bytes`` nor
        ``bytearray``
    """
    stream: Union[codecs.StreamReader, StringIO]
    if isinstance(s, str):
        stream = StringIO(s)
    elif isinstance(s, (bytes, bytearray)):
        stream = codecs.getreader("utf-8")(BytesIO(s))
    else:
        raise ParseError("Item to parse must be a string instance.")
    self.parse(stream, **kwargs)
def readline(self) -> Optional[str]:
    """Read an N-Triples line from buffered input.

    Input is pulled from ``self.file`` in chunks of ``bufsiz`` and kept in
    ``self.buffer`` until ``r_line`` matches at its start.

    :return: group 1 of the ``r_line`` match (presumably the line without
        its terminator -- confirm against the pattern), or ``None`` once
        the input is exhausted
    """
    # N-Triples lines end in either CRLF, CR, or LF
    # Therefore, we can't just use f.readline()
    if not self.buffer:
        # Nothing buffered: refill, and stop if the source is empty too.
        # type error: Item "None" of "Union[TextIO, StreamReader, None]" has no attribute "read"
        buffer = self.file.read(bufsiz)  # type: ignore[union-attr]
        if not buffer:
            return None
        self.buffer = buffer

    while True:
        m = r_line.match(self.buffer)
        if m:  # the more likely prospect
            # Drop the matched line from the buffer and hand it back.
            self.buffer = self.buffer[m.end() :]
            return m.group(1)
        else:
            # No complete line buffered yet: read another chunk.
            # type error: Item "None" of "Union[TextIO, StreamReader, None]" has no attribute "read"
            buffer = self.file.read(bufsiz)  # type: ignore[union-attr]
            if not buffer and not self.buffer.isspace():
                # Last line does not need to be terminated with a newline
                buffer += "\n"
            elif not buffer:
                # Only whitespace is left at end of input.
                return None
            self.buffer += buffer
def parseline(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
    """Parse the statement in ``self.line`` and pass it to the sink.

    Blank lines and lines starting with ``#`` are ignored.

    :param bnode_context: blank node mapping forwarded to ``subject`` and
        ``object``
    :raises ParseError: if text remains on the line after the statement
    """
    self.eat(r_wspace)
    remaining = self.line
    if not remaining or remaining.startswith("#"):
        # Nothing to emit for an empty line or a comment.
        return

    s = self.subject(bnode_context)
    self.eat(r_wspaces)
    p = self.predicate()
    self.eat(r_wspaces)
    o = self.object(bnode_context)
    self.eat(r_tail)

    leftover = self.line
    if leftover:
        raise ParseError("Trailing garbage: {}".format(leftover))
    self.sink.triple(s, p, o)
def peek(self, token: str) -> bool:
    """Tell whether the unparsed part of the current line starts with ``token``."""
    remaining = self.line
    return remaining.startswith(token)  # type: ignore[union-attr]
def eat(self, pattern: Pattern[str]) -> Match[str]:
    """Consume ``pattern`` from the front of the current line.

    :param pattern: compiled regular expression matched at the start of
        ``self.line``
    :return: the match object; the matched text is removed from
        ``self.line``
    :raises ParseError: if the pattern does not match
    """
    line = self.line
    matched = pattern.match(line)  # type: ignore[arg-type]
    if matched is None:
        raise ParseError("Failed to eat %s at %s" % (pattern.pattern, line))
    self.line = line[matched.end() :]  # type: ignore[index]
    return matched
def subject(self, bnode_context=None) -> Union[bNode, URIRef]:
    """Parse the subject term: a URI reference or, failing that, a blank node.

    :raises ParseError: if neither form yields a truthy term
    """
    node = self.uriref()
    if not node:
        node = self.nodeid(bnode_context)
    if not node:
        raise ParseError("Subject must be uriref or nodeID")
    return node
def predicate(self) -> Union[bNode, URIRef]:
    """Parse the predicate term, which has to be a URI reference.

    :raises ParseError: if no URI reference is found
    """
    term = self.uriref()
    if term:
        return term
    raise ParseError("Predicate must be uriref")
def object(
    self, bnode_context: Optional[_BNodeContextType] = None
) -> Union[URI, bNode, Literal]:
    """Parse the object term, trying URI reference, blank node, then literal.

    :raises ParseError: if all three attempts end in ``False``
    """
    term = self.uriref()
    if not term:
        term = self.nodeid(bnode_context)
    if not term:
        # Last alternative: its result is kept even when falsy.
        term = self.literal()
    if term is False:
        raise ParseError("Unrecognised object type")
    return term
def uriref(self) -> Union[te.Literal[False], URI]:
    """Parse a ``<...>`` URI reference at the current position.

    :return: the URI after ``unquote`` and ``uriquote`` processing, or
        ``False`` when the line does not start with ``<``
    """
    if not self.peek("<"):
        return False
    raw = self.eat(r_uriref).group(1)
    return URI(uriquote(unquote(raw)))
def nodeid(
    self, bnode_context: Optional[_BNodeContextType] = None
) -> Union[te.Literal[False], bNode, URI]:
    """Parse a blank node label at the current position.

    With ``self.skolemize`` set the labelled node is returned skolemized.
    Otherwise the label is looked up in ``bnode_context`` (or the parser's
    own mapping when that is ``None``) and a new blank node is created and
    recorded for labels not seen before.

    :return: the resulting node, or ``False`` when the line does not start
        with ``_``
    """
    if not self.peek("_"):
        return False

    if self.skolemize:
        label = self.eat(r_nodeid).group(1)
        return bNode(label).skolemize()

    # Fix for https://github.com/RDFLib/rdflib/issues/204
    mapping = self._bnode_ids if bnode_context is None else bnode_context
    label = self.eat(r_nodeid).group(1)
    known = mapping.get(label, None)
    if known is not None:
        # Re-map to id specific to this doc
        return bNode(known)

    # First occurrence: generate a document-specific node and remember it.
    fresh = bNode()
    mapping[label] = fresh
    return fresh
def literal(self) -> Union[te.Literal[False], Literal]:
    """Parse a quoted literal at the current position.

    :return: the literal with its optional language tag or datatype, or
        ``False`` when the line does not start with ``"``
    :raises ParseError: if both a language tag and a datatype are given
    """
    if not self.peek('"'):
        return False

    lit, lang, dtype = self.eat(r_literal).groups()
    # Normalise "absent" (empty or unmatched groups) to None.
    lang = lang if lang else None
    dtype = URI(uriquote(unquote(dtype))) if dtype else None
    if lang and dtype:
        raise ParseError("Can't have both a language and a datatype")
    return Literal(unquote(lit), lang, dtype)
class NTGraphSink:
    """Sink that adds every triple it receives to a graph."""

    __slots__ = ("g",)

    def __init__(self, graph: Graph):
        # Target graph; only its ``add`` method is used here.
        self.g = graph

    def triple(self, s: _SubjectType, p: _PredicateType, o: _ObjectType) -> None:
        """Add the statement ``(s, p, o)`` to the wrapped graph."""
        statement = (s, p, o)
        self.g.add(statement)
class NTParser(Parser):
    """parser for the ntriples format, often stored with the .nt extension

    See http://www.w3.org/TR/rdf-testcases/#ntriples"""

    __slots__ = ()

    @classmethod
    def parse(cls, source: InputSource, sink: Graph, **kwargs: Any) -> None:
        """
        Parse the NT format

        The stream obtained from ``source`` is closed when parsing ends,
        whether it succeeds or raises.

        :type source: `rdflib.parser.InputSource`
        :param source: the source of NT-formatted data
        :type sink: `rdflib.graph.Graph`
        :param sink: where to send parsed triples
        :param kwargs: Additional arguments to pass to `.W3CNTriplesParser.parse`
        """
        f: Union[TextIO, IO[bytes], codecs.StreamReader]
        f = source.getCharacterStream()
        if not f:
            b = source.getByteStream()
            # TextIOBase includes: StringIO and TextIOWrapper
            if isinstance(b, TextIOBase):
                # f is not really a ByteStream, but a CharacterStream
                f = b  # type: ignore[assignment]
            else:
                # since N-Triples 1.1 files can and should be utf-8 encoded
                f = codecs.getreader("utf-8")(b)
        try:
            parser = W3CNTriplesParser(NTGraphSink(sink))
            parser.parse(f, **kwargs)
        finally:
            # Release the stream even when a line fails to parse.
            f.close()
@@ -0,0 +1,183 @@
from __future__ import annotations
from codecs import getreader
from enum import Enum
from typing import TYPE_CHECKING, Any, MutableMapping, Optional, Union
from rdflib.exceptions import ParserError as ParseError
from rdflib.graph import Dataset
from rdflib.parser import InputSource
from rdflib.plugins.parsers.nquads import NQuadsParser
# Build up from the NTriples parser:
from rdflib.plugins.parsers.ntriples import r_nodeid, r_tail, r_uriref, r_wspace
from rdflib.term import BNode, URIRef
if TYPE_CHECKING:
import typing_extensions as te
__all__ = ["RDFPatchParser", "Operation"]
_BNodeContextType = MutableMapping[str, BNode]
class Operation(Enum):
    """
    Enum of RDF Patch operations.

    Each member's value is the operation code that `RDFPatchParser.operation`
    looks for at the start of a patch line.

    Operations:
    - `AddTripleOrQuad` (A): Adds a triple or quad.
    - `DeleteTripleOrQuad` (D): Deletes a triple or quad.
    - `AddPrefix` (PA): Adds a prefix.
    - `DeletePrefix` (PD): Deletes a prefix.
    - `TransactionStart` (TX): Starts a transaction.
    - `TransactionCommit` (TC): Commits a transaction.
    - `TransactionAbort` (TA): Aborts a transaction.
    - `Header` (H): Specifies a header.
    """

    # No code is a prefix of another, so a ``startswith`` test on the line
    # identifies at most one operation.
    AddTripleOrQuad = "A"
    DeleteTripleOrQuad = "D"
    AddPrefix = "PA"
    DeletePrefix = "PD"
    TransactionStart = "TX"
    TransactionCommit = "TC"
    TransactionAbort = "TA"
    Header = "H"
class RDFPatchParser(NQuadsParser):
    """Parser that applies an RDF Patch document to a `Dataset`.

    Add/delete statements and prefix operations are applied to the sink as
    they are read. Header and transaction lines are recognised but have no
    effect.
    """

    def parse(  # type: ignore[override]
        self,
        inputsource: InputSource,
        sink: Dataset,
        bnode_context: Optional[_BNodeContextType] = None,
        skolemize: bool = False,
        **kwargs: Any,
    ) -> Dataset:
        """
        Parse inputsource as an RDF Patch file.

        :type inputsource: `rdflib.parser.InputSource`
        :param inputsource: the source of RDF Patch formatted data
        :type sink: `rdflib.graph.Dataset`
        :param sink: where to send parsed data
        :type bnode_context: `dict`, optional
        :param bnode_context: a dict mapping blank node identifiers to `~rdflib.term.BNode` instances.
                              See `.W3CNTriplesParser.parse`
        :param skolemize: stored on the parser as ``self.skolemize``
        :return: the dataset, built on ``sink.store``, that received the patch
        :raises ParseError: if the source is not file-like or a line is invalid
        """
        assert sink.store.context_aware, (
            "RDFPatchParser must be given" " a context aware store."
        )
        # type error: Incompatible types in assignment (expression has type "ConjunctiveGraph", base class "W3CNTriplesParser" defined the type as "Union[DummySink, NTGraphSink]")
        self.sink: Dataset = Dataset(store=sink.store)
        self.skolemize = skolemize

        source = inputsource.getCharacterStream()
        if not source:
            source = inputsource.getByteStream()
            source = getreader("utf-8")(source)

        if not hasattr(source, "read"):
            raise ParseError("Item to parse must be a file-like object.")

        self.file = source
        self.buffer = ""
        while True:
            # Keep the untouched line for error reporting: ``self.line`` is
            # consumed piece by piece while it is parsed.
            self.line = original_line = self.readline()
            if original_line is None:
                break
            try:
                self.parsepatch(bnode_context)
            except ParseError as msg:
                raise ParseError(
                    "Invalid line (%s):\n%r" % (msg, original_line)
                ) from msg
        return self.sink

    def parsepatch(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
        """Parse the patch line in ``self.line`` and apply its operation."""
        self.eat(r_wspace)
        # From spec: "No comments should be included (comments start # and run to end
        # of line)."
        if (not self.line) or self.line.startswith("#"):
            return  # The line is empty or a comment

        # if header, transaction, skip
        operation = self.operation()
        self.eat(r_wspace)

        if operation in [Operation.AddTripleOrQuad, Operation.DeleteTripleOrQuad]:
            self.add_or_remove_triple_or_quad(operation, bnode_context)
        elif operation == Operation.AddPrefix:
            self.add_prefix()
        elif operation == Operation.DeletePrefix:
            self.delete_prefix()

    def add_or_remove_triple_or_quad(
        self, operation, bnode_context: Optional[_BNodeContextType] = None
    ) -> None:
        """Parse a triple or quad and add it to, or remove it from, the sink.

        :param operation: ``Operation.AddTripleOrQuad`` or
            ``Operation.DeleteTripleOrQuad``
        :raises ParseError: if text remains after the statement
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith("#"):
            return  # The line is empty or a comment

        subject = self.labeled_bnode() or self.subject(bnode_context)
        self.eat(r_wspace)

        predicate = self.predicate()
        self.eat(r_wspace)

        obj = self.labeled_bnode() or self.object(bnode_context)
        self.eat(r_wspace)

        # Optional fourth term naming the graph; falsy when absent.
        context = self.labeled_bnode() or self.uriref() or self.nodeid(bnode_context)
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")
        # Must have a context aware store - add on a normal Graph
        # discards anything where the ctx != graph.identifier
        if operation == Operation.AddTripleOrQuad:
            if context:
                self.sink.get_context(context).add((subject, predicate, obj))
            else:
                self.sink.default_context.add((subject, predicate, obj))
        elif operation == Operation.DeleteTripleOrQuad:
            if context:
                self.sink.get_context(context).remove((subject, predicate, obj))
            else:
                self.sink.default_context.remove((subject, predicate, obj))

    def add_prefix(self):
        """Bind the prefix declared on the current line in the sink."""
        # Extract prefix and URI from the line
        prefix, ns, _ = self.line.replace('"', "").replace("'", "").split(" ")  # type: ignore[union-attr]
        ns_stripped = ns.strip("<>")
        self.sink.bind(prefix, ns_stripped)

    def delete_prefix(self):
        """Rebind the prefix named on the current line to ``None``."""
        prefix, _, _ = self.line.replace('"', "").replace("'", "").split(" ")  # type: ignore[union-attr]
        self.sink.namespace_manager.bind(prefix, None, replace=True)

    def operation(self) -> Operation:
        """Identify and consume the operation code at the start of the line.

        :raises ValueError: if the line starts with no known operation code
        """
        for op in Operation:
            if self.line.startswith(op.value):  # type: ignore[union-attr]
                self.eat_op(op.value)
                return op
        raise ValueError(
            f'Invalid or no Operation found in line: "{self.line}". Valid Operations '
            f"codes are {', '.join([op.value for op in Operation])}"
        )

    def eat_op(self, op: str) -> None:
        """Remove the leading operation code ``op`` from the current line."""
        # ``str.lstrip(op)`` would treat ``op`` as a set of characters and
        # could strip more than the code itself; cut the exact prefix.
        line = self.line
        if line.startswith(op):  # type: ignore[union-attr]
            self.line = line[len(op) :]  # type: ignore[index]

    def nodeid(
        self, bnode_context: Optional[_BNodeContextType] = None
    ) -> Union[te.Literal[False], BNode, URIRef]:
        """Parse a blank node label, keeping the label as the node id.

        ``bnode_context`` is accepted for signature compatibility but not used.

        :return: the blank node, or ``False`` if the line does not start
            with ``_``
        """
        if self.peek("_"):
            return BNode(self.eat(r_nodeid).group(1))
        return False

    def labeled_bnode(self):
        """Parse a blank node written inside angle brackets (``<_...>``).

        :return: the blank node, or ``False`` if the line does not start
            with ``<_``
        :raises ParseError: if the bracketed text is not a blank node label
        """
        if self.peek("<_"):
            plain_uri = self.eat(r_uriref).group(1)
            label = r_nodeid.match(plain_uri)
            if label is None:
                # Previously surfaced as AttributeError on ``None``.
                raise ParseError("Malformed labelled blank node: <%s>" % plain_uri)
            return BNode(label.group(1))
        return False
@@ -0,0 +1,651 @@
"""
An RDF/XML parser for RDFLib
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, List, NoReturn, Optional, Tuple
from urllib.parse import urldefrag, urljoin
from xml.sax import handler, make_parser, xmlreader
from xml.sax.handler import ErrorHandler
from xml.sax.saxutils import escape, quoteattr
from rdflib.exceptions import Error, ParserError
from rdflib.graph import Graph
from rdflib.namespace import RDF, is_ncname
from rdflib.parser import InputSource, Parser
from rdflib.plugins.parsers.RDFVOC import RDFVOC
from rdflib.term import BNode, Identifier, Literal, URIRef
if TYPE_CHECKING:
# from xml.sax.expatreader import ExpatLocator
from xml.sax.xmlreader import AttributesImpl, Locator
from rdflib.graph import _ObjectType, _SubjectType, _TripleType
__all__ = ["create_parser", "BagID", "ElementHandler", "RDFXMLHandler", "RDFXMLParser"]
RDFNS = RDFVOC
# http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
# A mapping from unqualified terms to their qualified version.
UNQUALIFIED = {
"about": RDFVOC.about,
"ID": RDFVOC.ID,
"type": RDFVOC.type,
"resource": RDFVOC.resource,
"parseType": RDFVOC.parseType,
}
# http://www.w3.org/TR/rdf-syntax-grammar/#coreSyntaxTerms
CORE_SYNTAX_TERMS = [
RDFVOC.RDF,
RDFVOC.ID,
RDFVOC.about,
RDFVOC.parseType,
RDFVOC.resource,
RDFVOC.nodeID,
RDFVOC.datatype,
]
# http://www.w3.org/TR/rdf-syntax-grammar/#syntaxTerms
SYNTAX_TERMS = CORE_SYNTAX_TERMS + [RDFVOC.Description, RDFVOC.li]
# http://www.w3.org/TR/rdf-syntax-grammar/#oldTerms
OLD_TERMS = [
URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"),
URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"),
URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"),
]
NODE_ELEMENT_EXCEPTIONS = (
CORE_SYNTAX_TERMS
+ [
RDFVOC.li,
]
+ OLD_TERMS
)
NODE_ELEMENT_ATTRIBUTES = [RDFVOC.ID, RDFVOC.nodeID, RDFVOC.about]
PROPERTY_ELEMENT_EXCEPTIONS = (
CORE_SYNTAX_TERMS
+ [
RDFVOC.Description,
]
+ OLD_TERMS
)
PROPERTY_ATTRIBUTE_EXCEPTIONS = (
CORE_SYNTAX_TERMS + [RDFVOC.Description, RDFVOC.li] + OLD_TERMS
)
PROPERTY_ELEMENT_ATTRIBUTES = [RDFVOC.ID, RDFVOC.resource, RDFVOC.nodeID]
XMLNS = "http://www.w3.org/XML/1998/namespace"
BASE = (XMLNS, "base")
LANG = (XMLNS, "lang")
class BagID(URIRef):
    """URI reference that also hands out successive ``rdf:_n`` properties."""

    __slots__ = ["li"]

    def __init__(self, val):
        # The string value is consumed when the instance is created. The old
        # ``super(URIRef, self).__init__(val)`` passed it on to
        # ``object.__init__``, which rejects extra arguments, so construction
        # failed with TypeError.
        super().__init__()
        # Index of the last rdf:_n property handed out.
        self.li = 0

    def next_li(self):
        """Advance the counter and return the matching ``rdf:_n`` term."""
        self.li += 1
        # type error: Type expected within [...]
        return RDFNS["_%s" % self.li]  # type: ignore[misc]
class ElementHandler:
    """Per-element parsing state kept on the `RDFXMLHandler` stack.

    ``start``, ``char`` and ``end`` hold the callbacks used for the
    element; the remaining slots carry data collected while parsing it.
    """

    __slots__ = [
        "start",
        "char",
        "end",
        "li",
        "id",
        "base",
        "subject",
        "predicate",
        "object",
        "list",
        "language",
        "datatype",
        "declared",
        "data",
    ]

    def __init__(self):
        self.start = None
        self.char = None
        self.end = None
        self.li = 0
        self.id = None
        self.base = None
        self.subject = None
        # Declared in __slots__ but previously left unset, so reading it
        # before assignment raised AttributeError.
        self.predicate = None
        self.object = None
        self.list = None
        self.language = None
        self.datatype = None
        self.declared = None
        self.data = None

    def next_li(self):
        """Advance the ``rdf:li`` counter and return the matching ``rdf:_n`` term."""
        self.li += 1
        return RDFVOC["_%s" % self.li]
class RDFXMLHandler(handler.ContentHandler):
def __init__(self, store: Graph):
self.store = store
self.preserve_bnode_ids = False
self.reset()
def reset(self) -> None:
    """Put the handler back into its initial, empty-document state."""
    root = ElementHandler()
    root.start = self.document_element_start
    root.end = lambda name, qname: None
    # The leading None is what ``parent`` yields for the document element.
    self.stack: List[Optional[ElementHandler]] = [None, root]
    self.ids: Dict[str, int] = {}  # remember IDs we have already seen
    self.bnode: Dict[str, Identifier] = {}
    # Stack of uri -> prefix dicts; the current one is its last entry.
    initial_context: Dict[str, Optional[str]] = {}
    self._ns_contexts: List[Dict[str, Optional[str]]] = [initial_context]
    self._current_context: Dict[str, Optional[str]] = initial_context
# ContentHandler methods
def setDocumentLocator(self, locator: Locator):
self.locator = locator
def startDocument(self) -> None:
pass
def startPrefixMapping(self, prefix: Optional[str], namespace: str) -> None:
    """Record a new prefix mapping and bind it on the store.

    A copy of the current uri -> prefix dict is pushed onto the context
    stack before the mapping is added to the current dict.
    """
    snapshot = self._current_context.copy()
    self._ns_contexts.append(snapshot)
    self._current_context[namespace] = prefix
    self.store.bind(prefix, namespace or "", override=False)
def endPrefixMapping(self, prefix: Optional[str]) -> None:
    """Make the most recently pushed namespace context current and pop it."""
    self._current_context = self._ns_contexts.pop()
def startElementNS(
    self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
) -> None:
    """Handle an element start: push a handler, set base/language, dispatch.

    A new `ElementHandler` is pushed for the element's children, the base
    URI and language of the element are worked out from ``xml:base`` /
    ``xml:lang`` or inherited from the parent, and the element's own
    ``start`` callback is invoked.
    """
    stack = self.stack
    stack.append(ElementHandler())
    current = self.current
    parent = self.parent
    # type error: No overload for "get" of "AttributesImpl" matches tuple (str, str)
    base = attrs.get(BASE, None)  # type: ignore[call-overload, unused-ignore]
    if base is not None:
        # Explicit xml:base: drop its fragment and resolve it against the
        # parent's base or, failing that, the document's id.
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        # No xml:base: inherit from the parent, else fall back to the
        # document's id without its fragment.
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    # type error: No overload for "get" of "AttributesImpl" matches tuple (str, str)
    language = attrs.get(LANG, None)  # type: ignore[call-overload, unused-ignore]
    if language is None:
        # xml:lang is inherited when not given on the element itself.
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
def endElementNS(self, name: Tuple[Optional[str], str], qname) -> None:
self.current.end(name, qname)
self.stack.pop()
def characters(self, content: str) -> None:
    """Forward character data to the current element's ``char`` callback, if set."""
    on_chars = self.current.char
    if on_chars:
        on_chars(content)
def ignorableWhitespace(self, content) -> None:
pass
def processingInstruction(self, target, data) -> None:
pass
def add_reified(self, sid: Identifier, spo: _TripleType):
    """Add the four reification triples describing ``spo`` with subject ``sid``."""
    s, p, o = spo
    reification = (
        (RDF.type, RDF.Statement),
        (RDF.subject, s),
        (RDF.predicate, p),
        (RDF.object, o),
    )
    for prop, value in reification:
        self.store.add((sid, prop, value))
def error(self, message: str) -> NoReturn:
    """Raise `ParserError` for ``message``, prefixed with the locator position.

    The prefix has the form ``systemId:line:column: ``.
    """
    loc = self.locator
    position = (loc.getSystemId(), loc.getLineNumber(), loc.getColumnNumber())
    raise ParserError("%s:%s:%s: " % position + message)
def get_current(self) -> Optional[ElementHandler]:
return self.stack[-2]
# Create a read only property called current so that self.current
# give the current element handler.
current = property(get_current)
def get_next(self) -> Optional[ElementHandler]:
return self.stack[-1]
# Create a read only property that gives the element handler to be
# used for the next element.
next = property(get_next)
def get_parent(self) -> Optional[ElementHandler]:
return self.stack[-3]
# Create a read only property that gives the current parent
# element handler
parent = property(get_parent)
def absolutize(self, uri: str) -> URIRef:
    """Resolve ``uri`` against the current element's base URI.

    A trailing ``#`` on ``uri`` is put back if joining dropped it.
    """
    joined = urljoin(self.current.base, uri, allow_fragments=True)
    lost_empty_fragment = bool(uri) and uri[-1] == "#" and joined[-1] != "#"
    if lost_empty_fragment:
        joined = "%s#" % joined
    return URIRef(joined)
def convert(
    self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
) -> Tuple[URIRef, Dict[URIRef, str]]:
    """Turn a SAX element name and its attributes into URI-keyed form.

    :param name: ``(namespace, local name)`` pair of the element
    :param attrs: SAX attributes keyed by ``(namespace, local name)``
    :return: the element URI and a dict mapping attribute URIs to values;
        XML-reserved attributes are left out and the unqualified RDF terms
        are looked up in the RDF namespace
    """
    element = URIRef(name[1] if name[0] is None else "".join(name))  # type: ignore[arg-type]

    atts = {}
    for key, value in attrs.items():
        att = key[1] if key[0] is None else "".join(key)
        if att.startswith(XMLNS) or att[0:3].lower() == "xml":
            # xml:* attributes carry no RDF statements.
            continue
        if att in UNQUALIFIED:
            # type error: Variable "att" is not valid as a type
            atts[RDFNS[att]] = value  # type: ignore[misc, valid-type]
        else:
            atts[URIRef(att)] = value
    return element, atts
def document_element_start(
self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
if name[0] and URIRef("".join(name)) == RDFVOC.RDF:
next = self.next
next.start = self.node_element_start
next.end = self.node_element_end
else:
self.node_element_start(name, qname, attrs)
# self.current.end = self.node_element_end
# TODO... set end to something that sets start such that
# another element will cause error
def node_element_start(
self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
# type error: Incompatible types in assignment (expression has type "URIRef", variable has type "Tuple[str, str]")
name, atts = self.convert(name, qname, attrs) # type: ignore[assignment]
current = self.current
absolutize = self.absolutize
next = self.next
next.start = self.property_element_start
next.end = self.property_element_end
if name in NODE_ELEMENT_EXCEPTIONS:
# type error: Not all arguments converted during string formatting
self.error("Invalid node element URI: %s" % name) # type: ignore[str-format]
subject: _SubjectType
if RDFVOC.ID in atts:
if RDFVOC.about in atts or RDFVOC.nodeID in atts:
self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID")
id = atts[RDFVOC.ID]
if not is_ncname(id):
self.error("rdf:ID value is not a valid NCName: %s" % id)
subject = absolutize("#%s" % id)
if subject in self.ids:
self.error("two elements cannot use the same ID: '%s'" % subject)
self.ids[subject] = 1 # IDs can only appear once within a document
elif RDFVOC.nodeID in atts:
if RDFVOC.ID in atts or RDFVOC.about in atts:
self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID")
nodeID = atts[RDFVOC.nodeID]
if not is_ncname(nodeID):
self.error("rdf:nodeID value is not a valid NCName: %s" % nodeID)
if self.preserve_bnode_ids is False:
if nodeID in self.bnode:
subject = self.bnode[nodeID]
else:
subject = BNode()
self.bnode[nodeID] = subject
else:
subject = BNode(nodeID)
elif RDFVOC.about in atts:
if RDFVOC.ID in atts or RDFVOC.nodeID in atts:
self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID")
subject = absolutize(atts[RDFVOC.about])
else:
subject = BNode()
if name != RDFVOC.Description: # S1
# error: Argument 1 has incompatible type "Tuple[str, str]"; expected "str"
self.store.add((subject, RDF.type, absolutize(name))) # type: ignore[arg-type]
object: _ObjectType
language = current.language
for att in atts:
if not att.startswith(str(RDFNS)):
predicate = absolutize(att)
try:
object = Literal(atts[att], language)
except Error as e:
# type error: Argument 1 to "error" of "RDFXMLHandler" has incompatible type "Optional[str]"; expected "str"
self.error(e.msg) # type: ignore[arg-type]
elif att == RDF.type: # S2
predicate = RDF.type
object = absolutize(atts[RDF.type])
elif att in NODE_ELEMENT_ATTRIBUTES:
continue
elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS: # S3
self.error("Invalid property attribute URI: %s" % att)
# type error: Statement is unreachable
continue # type: ignore[unreachable] # for when error does not throw an exception
else:
predicate = absolutize(att)
try:
object = Literal(atts[att], language)
except Error as e:
# type error: Argument 1 to "error" of "RDFXMLHandler" has incompatible type "Optional[str]"; expected "str"
self.error(e.msg) # type: ignore[arg-type]
self.store.add((subject, predicate, object))
current.subject = subject
def node_element_end(self, name: Tuple[str, str], qname) -> None:
    """Finish a node element and hand its subject to the parent as object.

    Repeated node elements are only allowed at top level; elsewhere a
    parent that already has an object is reported as an error.
    """
    parent = self.parent
    if parent.object and self.current != self.stack[2]:
        self.error(
            "Repeat node-elements inside property elements: %s" % "".join(name)
        )
    parent.object = self.current.subject
def property_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Handle the start of a property element.

    Works out the predicate, the optional reification id and, from
    ``rdf:resource`` / ``rdf:nodeID`` / ``rdf:parseType`` and the property
    attributes, either the object or the callbacks that will produce it.
    The results are stored on the current element handler.
    """
    # type error: Incompatible types in assignment (expression has type "URIRef", variable has type "Tuple[str, str]")
    name, atts = self.convert(name, qname, attrs)  # type: ignore[assignment]
    current = self.current
    absolutize = self.absolutize
    next = self.next
    object: Optional[_ObjectType] = None
    current.data = None
    current.list = None

    # type error: "Tuple[str, str]" has no attribute "startswith"
    if not name.startswith(str(RDFNS)):  # type: ignore[attr-defined]
        # type error: Argument 1 has incompatible type "Tuple[str, str]"; expected "str"
        current.predicate = absolutize(name)  # type: ignore[arg-type]
    elif name == RDFVOC.li:
        current.predicate = current.next_li()
    elif name in PROPERTY_ELEMENT_EXCEPTIONS:
        # type error: Not all arguments converted during string formatting
        self.error("Invalid property element URI: %s" % name)  # type: ignore[str-format]
    else:
        # type error: Argument 1 has incompatible type "Tuple[str, str]"; expected "str"
        current.predicate = absolutize(name)  # type: ignore[arg-type]

    id = atts.get(RDFVOC.ID, None)
    if id is not None:
        if not is_ncname(id):
            self.error("rdf:ID value is not a valid NCName: %s" % id)
        current.id = absolutize("#%s" % id)
    else:
        current.id = None

    resource = atts.get(RDFVOC.resource, None)
    nodeID = atts.get(RDFVOC.nodeID, None)
    parse_type = atts.get(RDFVOC.parseType, None)
    if resource is not None and nodeID is not None:
        self.error("Property element cannot have both rdf:nodeID and rdf:resource")
    if resource is not None:
        object = absolutize(resource)
        next.start = self.node_element_start
        next.end = self.node_element_end
    elif nodeID is not None:
        if not is_ncname(nodeID):
            self.error("rdf:nodeID value is not a valid NCName: %s" % nodeID)
        if self.preserve_bnode_ids is False:
            # Map document labels to generated blank nodes, reusing the
            # node for a label that was seen before.
            if nodeID in self.bnode:
                object = self.bnode[nodeID]
            else:
                object = BNode()
                self.bnode[nodeID] = object
        else:
            object = BNode(nodeID)
        next.start = self.node_element_start
        next.end = self.node_element_end
    else:
        if parse_type is not None:
            for att in atts:
                if att != RDFVOC.parseType and att != RDFVOC.ID:
                    self.error("Property attr '%s' not allowed here" % att)
            if parse_type == "Resource":
                current.subject = object = BNode()
                current.char = self.property_element_char
                next.start = self.property_element_start
                next.end = self.property_element_end
            elif parse_type == "Collection":
                current.char = None
                object = current.list = RDF.nil  # BNode()
                # self.parent.subject
                next.start = self.node_element_start
                next.end = self.list_node_element_end
            else:  # if parse_type=="Literal":
                # All other values are treated as Literal
                # See: http://www.w3.org/TR/rdf-syntax-grammar/
                # parseTypeOtherPropertyElt
                object = Literal("", datatype=RDFVOC.XMLLiteral)
                current.char = self.literal_element_char
                current.declared = {XMLNS: "xml"}
                next.start = self.literal_element_start
                next.char = self.literal_element_char
                next.end = self.literal_element_end
            current.object = object
            return
        else:
            object = None
            current.char = self.property_element_char
            next.start = self.node_element_start
            next.end = self.node_element_end

    datatype = current.datatype = atts.get(RDFVOC.datatype, None)
    language = current.language
    if datatype is not None:
        # TODO: check that there are no atts other than datatype and id
        # NOTE(review): the resolved value is not used afterwards;
        # ``current.datatype`` keeps the attribute value as written.
        datatype = absolutize(datatype)
    else:
        for att in atts:
            if not att.startswith(str(RDFNS)):
                predicate = absolutize(att)
            elif att in PROPERTY_ELEMENT_ATTRIBUTES:
                continue
            elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS:
                self.error("""Invalid property attribute URI: %s""" % att)
            else:
                predicate = absolutize(att)

            o: _ObjectType
            if att == RDF.type:
                o = URIRef(atts[att])
            else:
                # No rdf:datatype is present on this branch, so property
                # attributes become plain literals in the current language.
                o = Literal(atts[att], language)

            if object is None:
                object = BNode()
            self.store.add((object, predicate, o))
    if object is None:
        # No object yet: collect character data for a literal value.
        current.data = ""
        current.object = None
    else:
        current.data = None
        current.object = object
def property_element_char(self, data: str) -> None:
    """Append character data to the current element while it collects text."""
    element = self.current
    if element.data is None:
        # Not collecting text for this element.
        return
    element.data += data
def property_element_end(self, name: Tuple[str, str], qname) -> None:
    """Finish a property element and add the resulting statement(s).

    Collected text becomes a literal object, an open collection is
    terminated with ``rdf:nil``, and the statement is added to the store
    and reified when the element carried an ``rdf:ID``.
    """
    current = self.current
    if current.data is not None and current.object is None:
        # A datatype rules out a language tag on the literal.
        lang = None if current.datatype is not None else current.language
        current.object = Literal(current.data, lang, current.datatype)
        current.data = None

    closes_collection = self.next.end == self.list_node_element_end
    if closes_collection and current.object != RDF.nil:
        self.store.add((current.list, RDF.rest, RDF.nil))

    if current.object is not None:
        statement = (self.parent.subject, current.predicate, current.object)
        self.store.add(statement)
        if current.id is not None:
            self.add_reified(current.id, statement)
    current.subject = None
def list_node_element_end(self, name: Tuple[str, str], qname) -> None:
    """Append the finished node element to the parent's RDF collection.

    The first item starts the list and becomes the parent's object; each
    later item is linked from the previous cell through ``rdf:rest``.
    """
    parent = self.parent
    item = self.current.subject
    cell = BNode()
    if parent.list == RDF.nil:
        # First member of the collection.
        parent.list = cell
        self.store.add((cell, RDF.first, item))
        parent.object = cell
        parent.char = None
    else:
        self.store.add((parent.list, RDF.rest, cell))
        self.store.add((cell, RDF.first, item))
        parent.list = cell
def literal_element_start(
self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
current = self.current
self.next.start = self.literal_element_start
self.next.char = self.literal_element_char
self.next.end = self.literal_element_end
current.declared = self.parent.declared.copy()
if name[0]:
prefix = self._current_context[name[0]]
if prefix:
current.object = "<%s:%s" % (prefix, name[1])
else:
current.object = "<%s" % name[1]
if not name[0] in current.declared: # noqa: E713
current.declared[name[0]] = prefix
if prefix:
current.object += ' xmlns:%s="%s"' % (prefix, name[0])
else:
current.object += ' xmlns="%s"' % name[0]
else:
current.object = "<%s" % name[1]
# type error: Incompatible types in assignment (expression has type "str", variable has type "Tuple[str, str]")
for name, value in attrs.items(): # type: ignore[assignment, unused-ignore]
if name[0]:
if not name[0] in current.declared: # noqa: E713
current.declared[name[0]] = self._current_context[name[0]]
name = current.declared[name[0]] + ":" + name[1]
else:
# type error: Incompatible types in assignment (expression has type "str", variable has type "Tuple[str, str]")
name = name[1] # type: ignore[assignment]
current.object += " %s=%s" % (name, quoteattr(value))
current.object += ">"
def literal_element_char(self, data: str) -> None:
self.current.object += escape(data)
def literal_element_end(self, name: Tuple[str, str], qname) -> None:
    """Close an element inside an XML literal.

    The element's serialized text plus its end tag is appended to the
    parent's text. The tag is prefixed when the element's namespace has a
    non-empty prefix in the current namespace context.
    """
    namespace, local = name[0], name[1]
    prefix = self._current_context[namespace] if namespace else None
    tag = "%s:%s" % (prefix, local) if prefix else local
    self.parent.object += self.current.object + "</%s>" % tag
def create_parser(target: InputSource, store: Graph) -> xmlreader.XMLReader:
    """Build a namespace-aware SAX reader that feeds RDF/XML into ``store``.

    :param target: input source, also installed as the handler's locator
    :param store: graph that receives the parsed triples
    :return: the configured reader
    """
    sax_reader = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        sax_reader.start_namespace_decl("xml", "http://www.w3.org/XML/1998/namespace")  # type: ignore[attr-defined]
    except AttributeError:
        pass  # Not present in Jython (at least)
    sax_reader.setFeature(handler.feature_namespaces, 1)

    content_handler = RDFXMLHandler(store)
    # type error: Argument 1 to "setDocumentLocator" of "RDFXMLHandler" has incompatible type "InputSource"; expected "Locator"
    content_handler.setDocumentLocator(target)  # type: ignore[arg-type]
    sax_reader.setContentHandler(content_handler)
    sax_reader.setErrorHandler(ErrorHandler())
    return sax_reader
class RDFXMLParser(Parser):
    """RDFLib parser plugin for RDF/XML."""

    def __init__(self):
        pass

    def parse(self, source: InputSource, sink: Graph, **args: Any) -> None:
        """Parse ``source`` as RDF/XML into ``sink``.

        :param source: the RDF/XML input
        :param sink: graph receiving the triples
        :param args: ``preserve_bnode_ids``, when given and not ``None``,
            is copied onto the content handler
        """
        self._parser = create_parser(source, sink)
        content_handler = self._parser.getContentHandler()
        keep_ids = args.get("preserve_bnode_ids", None)
        if keep_ids is not None:
            # type error: ContentHandler has no attribute "preserve_bnode_ids"
            content_handler.preserve_bnode_ids = keep_ids  # type: ignore[attr-defined, unused-ignore]
        self._parser.parse(source)
@@ -0,0 +1,177 @@
from __future__ import annotations
from typing import Any, MutableSequence
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.parser import InputSource, Parser
from .notation3 import RDFSink, SinkParser
def becauseSubGraph(*args, **kwargs):  # noqa: N802
    """Accept any arguments and do nothing.

    Installed as the parser's ``_reason2`` while a graph block is parsed.
    """
    return None
class TrigSinkParser(SinkParser):
    """Sink parser extended with TriG graph blocks."""

    def directiveOrStatement(self, argstr: str, h: int) -> int:  # noqa: N802
        """Parse one graph block, directive or statement starting at ``h``.

        :return: the position after the parsed construct, or a negative
            value at end of input or when nothing matched
        """
        i = self.skipSpace(argstr, h)
        if i < 0:
            return i  # EOF

        j = self.graph(argstr, i)
        if j >= 0:
            return j

        j = self.sparqlDirective(argstr, i)
        if j >= 0:
            return j

        j = self.directive(argstr, i)
        if j >= 0:
            return self.checkDot(argstr, j)

        j = self.statement(argstr, i)
        if j >= 0:
            return self.checkDot(argstr, j)

        return j

    def labelOrSubject(  # noqa: N802
        self, argstr: str, i: int, res: MutableSequence[Any]
    ) -> int:
        """Parse a graph label: a URI reference or an empty ``[]`` blank node.

        :param res: list that receives the parsed term
        :return: the position after the label, or a negative value if none
            was found
        """
        j = self.skipSpace(argstr, i)
        if j < 0:
            return j  # eof
        i = j

        j = self.uri_ref2(argstr, i, res)
        if j >= 0:
            return j

        if argstr[i] == "[":
            j = self.skipSpace(argstr, i + 1)
            if j < 0:
                self.BadSyntax(argstr, i, "Expected ] got EOF")
            if argstr[j] == "]":
                res.append(self.blankNode())
                return j + 1
        return -1

    def graph(self, argstr: str, i: int) -> int:
        """
        Parse trig graph, i.e.

           <urn:graphname> = { .. triples .. }

        return -1 if it doesn't look like a graph-decl
        raise Exception if it looks like a graph, but isn't.
        """
        need_graphid = False
        j = self.sparqlTok("GRAPH", argstr, i)  # optional GRAPH keyword
        if j >= 0:
            i = j
            need_graphid = True

        r: MutableSequence[Any] = []
        j = self.labelOrSubject(argstr, i, r)
        if j >= 0:
            graph = r[0]
            i = j
        elif need_graphid:
            self.BadSyntax(argstr, i, "GRAPH keyword must be followed by graph name")
        else:
            graph = self._store.graph.identifier  # hack

        j = self.skipSpace(argstr, i)
        if j < 0:
            self.BadSyntax(argstr, i, "EOF found when expected graph")

        if argstr[j : j + 1] == "=":  # optional = for legacy support
            i = self.skipSpace(argstr, j + 1)
            if i < 0:
                # ``i`` is the negative EOF marker here; report the last
                # real position so the error context stays meaningful.
                self.BadSyntax(argstr, j + 1, "EOF found when expecting '{'")
        else:
            i = j

        if argstr[i : i + 1] != "{":
            return -1  # the node wasn't part of a graph

        j = i + 1

        if self._context is not None:
            self.BadSyntax(argstr, i, "Nested graphs are not allowed")

        # Switch the parser to the new graph for the duration of the block.
        oldParentContext = self._parentContext  # noqa: N806
        self._parentContext = self._context
        reason2 = self._reason2
        self._reason2 = becauseSubGraph
        # type error: Incompatible types in assignment (expression has type "Graph", variable has type "Optional[Formula]")
        self._context = self._store.newGraph(graph)  # type: ignore[assignment]

        while 1:
            i = self.skipSpace(argstr, j)
            if i < 0:
                # As above: point at the last real position, not at EOF's -1.
                self.BadSyntax(argstr, j, "needed '}', found end.")

            if argstr[i : i + 1] == "}":
                j = i + 1
                break

            j = self.directiveOrStatement(argstr, i)
            if j < 0:
                self.BadSyntax(argstr, i, "expected statement or '}'")

        self._context = self._parentContext
        self._reason2 = reason2
        self._parentContext = oldParentContext
        # res.append(subj.close())    # No use until closed
        return j
class TrigParser(Parser):
    """
    An RDFLib parser for TriG
    """

    def __init__(self):
        pass

    def parse(self, source: InputSource, graph: Graph, encoding: str = "utf-8") -> None:
        """Parse TriG from ``source`` into ``graph``.

        :param source: input; its character stream is preferred and the byte
            stream is used as a fallback.
        :param graph: target graph; its store must be context-aware.  It is
            installed as the default context of a temporary
            ``ConjunctiveGraph`` sharing the same store.
        :param encoding: must be ``None`` or ``"utf-8"``.
        :raises Exception: if another encoding is requested.
        :raises AssertionError: if the store is not context-aware.
        """
        if encoding not in [None, "utf-8"]:
            # The previous code applied ``%`` to a tuple of two strings, which
            # raised TypeError instead of producing this message.
            raise Exception(
                f"TriG files are always utf-8 encoded, I was passed: {encoding}"
            )

        # we're currently being handed a Graph, not a ConjunctiveGraph
        assert graph.store.context_aware, "TriG Parser needs a context-aware store!"

        conj_graph = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
        conj_graph.default_context = graph  # TODO: CG __init__ should have a
        # default_context arg
        # TODO: update N3Processor so that it can use conj_graph as the sink
        conj_graph.namespace_manager = graph.namespace_manager

        sink = RDFSink(conj_graph)

        baseURI = conj_graph.absolutize(  # noqa: N806
            source.getPublicId() or source.getSystemId() or ""
        )
        p = TrigSinkParser(sink, baseURI=baseURI, turtle=True)

        stream = source.getCharacterStream()  # try to get str stream first
        if not stream:
            # fallback to get the bytes stream
            stream = source.getByteStream()
        p.loadStream(stream)

        # Copy the prefixes collected by the parser onto the graph.
        for prefix, namespace in p._bindings.items():
            conj_graph.bind(prefix, namespace)
@@ -0,0 +1,296 @@
"""
A TriX parser for RDFLib
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, List, NoReturn, Optional, Tuple
from xml.sax import handler, make_parser
from xml.sax.handler import ErrorHandler
from rdflib.exceptions import ParserError
from rdflib.graph import Graph
from rdflib.namespace import Namespace
from rdflib.parser import InputSource, Parser
from rdflib.store import Store
from rdflib.term import BNode, Identifier, Literal, URIRef
if TYPE_CHECKING:
# from xml.sax.expatreader import ExpatLocator
from xml.sax.xmlreader import AttributesImpl, Locator, XMLReader
__all__ = ["create_parser", "TriXHandler", "TriXParser"]
TRIXNS = Namespace("http://www.w3.org/2004/03/trix/trix-1/")
XMLNS = Namespace("http://www.w3.org/XML/1998/namespace")
class TriXHandler(handler.ContentHandler):
    """A SAX content handler for TriX. See http://sw.nokia.com/trix/

    ``self.state`` tracks the position in the document:
    0 = outside ``TriX``, 1 = inside ``TriX``, 2 = inside ``graph``,
    3 = reading the graph name (``uri``/``id``), 4 = inside ``triple``.
    """

    lang: Optional[str]
    datatype: Optional[str]

    def __init__(self, store: Store):
        self.store = store
        self.preserve_bnode_ids = False
        self.reset()

    def reset(self) -> None:
        """Return the handler to its initial, pre-document state."""
        self.bnode: Dict[str, BNode] = {}
        self.graph: Optional[Graph] = None
        self.triple: Optional[List[Identifier]] = None
        self.state = 0
        self.lang = None
        self.datatype = None

    # ContentHandler methods

    def setDocumentLocator(self, locator: Locator):
        self.locator = locator

    def startDocument(self) -> None:
        pass

    def startPrefixMapping(self, prefix: Optional[str], namespace: str) -> None:
        pass

    def endPrefixMapping(self, prefix: Optional[str]) -> None:
        pass

    def startElementNS(
        self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
    ) -> None:
        """Validate the opening tag against the current state and advance it.

        Every path that does not raise finishes by clearing the character
        buffer ``self.chars``.
        """
        ns_uri, local = name[0], name[1]

        if ns_uri != str(TRIXNS):
            self.error(
                "Only elements in the TriX namespace are allowed. %s!=%s"
                % (ns_uri, TRIXNS)
            )

        if local.lower() == "trix":
            if self.state != 0:
                self.error("Unexpected TriX element")
            self.state = 1

        elif local == "graph":
            if self.state != 1:
                self.error("Unexpected graph element")
            self.state = 2

        elif local == "uri":
            if self.state == 2:
                # the context uri
                self.state = 3
            elif self.state != 4:
                # in state 4 the uri is part of a triple: nothing to do yet
                self.error("Unexpected uri element")

        elif local == "triple":
            if self.state != 2:
                self.error("Unexpected triple element")
            if self.graph is None:
                # anonymous graph, create one with random bnode id
                self.graph = Graph(store=self.store)
            # start of a triple
            self.triple = []
            self.state = 4

        elif local == "typedLiteral":
            if self.state != 4:
                self.error("Unexpected typedLiteral element")
            # part of triple
            self.lang = None
            self.datatype = None
            try:
                self.lang = attrs.getValue((str(XMLNS), "lang"))  # type: ignore[arg-type, unused-ignore]
            except Exception:
                # language not required - ignore
                pass
            try:
                self.datatype = attrs.getValueByQName("datatype")  # type: ignore[arg-type, unused-ignore]
            except KeyError:
                self.error("No required attribute 'datatype'")

        elif local == "plainLiteral":
            if self.state != 4:
                self.error("Unexpected plainLiteral element")
            # part of triple
            self.lang = None
            self.datatype = None
            try:
                # type error: Argument 1 to "getValue" of "AttributesImpl" has incompatible type "Tuple[str, str]"; expected "str"
                self.lang = attrs.getValue((str(XMLNS), "lang"))  # type: ignore[arg-type, unused-ignore]
            except Exception:
                # language not required - ignore
                pass

        elif local == "id":
            if self.state == 2:
                # the context uri
                self.state = 3
            elif self.state != 4:
                # in state 4 the id is part of a triple: nothing to do yet
                self.error("Unexpected id element")

        else:
            self.error("Unknown element %s in TriX namespace" % local)

        self.chars = ""

    def endElementNS(self, name: Tuple[Optional[str], str], qname) -> None:
        """Consume the buffered text for the closing tag and update state."""
        if TYPE_CHECKING:
            assert self.triple is not None

        ns_uri, local = name[0], name[1]

        if ns_uri != str(TRIXNS):
            self.error(
                "Only elements in the TriX namespace are allowed. %s!=%s"
                % (ns_uri, TRIXNS)
            )

        if local == "uri":
            if self.state == 3:
                # graph name given as an IRI
                self.graph = Graph(
                    store=self.store, identifier=URIRef(self.chars.strip())
                )
                self.state = 2
            elif self.state == 4:
                self.triple += [URIRef(self.chars.strip())]
            else:
                self.error(
                    "Illegal internal self.state - This should never "
                    "happen if the SAX parser ensures XML syntax correctness"
                )

        elif local == "id":
            if self.state == 3:
                # graph name given as a blank node label
                self.graph = Graph(
                    self.store, identifier=self.get_bnode(self.chars.strip())
                )
                self.state = 2
            elif self.state == 4:
                self.triple += [self.get_bnode(self.chars.strip())]
            else:
                self.error(
                    "Illegal internal self.state - This should never "
                    "happen if the SAX parser ensures XML syntax correctness"
                )

        elif local in ("plainLiteral", "typedLiteral"):
            if self.state != 4:
                self.error(
                    "This should never happen if the SAX parser "
                    "ensures XML syntax correctness"
                )
            self.triple += [
                Literal(self.chars, lang=self.lang, datatype=self.datatype)
            ]

        elif local == "triple":
            if self.state != 4:
                self.error(
                    "This should never happen if the SAX parser "
                    "ensures XML syntax correctness"
                )
            if len(self.triple) != 3:
                self.error(
                    "Triple has wrong length, got %d elements: %s"
                    % (len(self.triple), self.triple)
                )
            # type error: Item "None" of "Optional[Graph]" has no attribute "add"
            # type error: Argument 1 to "add" of "Graph" has incompatible type "List[Identifier]"; expected "Tuple[Node, Node, Node]"
            self.graph.add(self.triple)  # type: ignore[union-attr, arg-type]
            self.state = 2

        elif local == "graph":
            self.graph = None
            self.state = 1

        elif local.lower() == "trix":
            self.state = 0

        else:
            self.error("Unexpected close element")

    def get_bnode(self, label: str) -> BNode:
        """Return the blank node for ``label``.

        With ``preserve_bnode_ids`` a new ``BNode(label)`` is built on every
        call; otherwise the node is created once and cached per label.
        """
        if self.preserve_bnode_ids:
            return BNode(label)
        if label not in self.bnode:
            self.bnode[label] = BNode(label)
        return self.bnode[label]

    def characters(self, content: str) -> None:
        self.chars += content

    def ignorableWhitespace(self, content) -> None:
        pass

    def processingInstruction(self, target, data) -> None:
        pass

    def error(self, message: str) -> NoReturn:
        """Raise ``ParserError`` prefixed with the locator's position."""
        loc = self.locator
        position = "%s:%s:%s: " % (
            loc.getSystemId(),
            loc.getLineNumber(),
            loc.getColumnNumber(),
        )
        raise ParserError(position + message)
def create_parser(store: Store) -> XMLReader:
    """Build a namespace-aware SAX reader whose content handler is a
    ``TriXHandler`` bound to ``store``."""
    reader = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        # type error: "XMLReader" has no attribute "start_namespace_decl"
        reader.start_namespace_decl("xml", "http://www.w3.org/XML/1998/namespace")  # type: ignore[attr-defined]
    except AttributeError:
        pass  # Not present in Jython (at least)
    reader.setFeature(handler.feature_namespaces, 1)
    reader.setContentHandler(TriXHandler(store))
    reader.setErrorHandler(ErrorHandler())
    return reader
class TriXParser(Parser):
    """A parser for TriX. See http://sw.nokia.com/trix/"""

    def __init__(self):
        # Nothing to set up: the SAX reader is created on every ``parse`` call.
        pass

    def parse(self, source: InputSource, sink: Graph, **args: Any) -> None:
        """Parse TriX from ``source`` into the store behind ``sink``.

        :param source: input passed to the SAX reader's ``parse`` method.
        :param sink: graph whose store must be context-aware.
        :param args: only ``preserve_bnode_ids`` is consulted; when it is not
            ``None`` its value is copied onto the content handler.
        """
        assert (
            sink.store.context_aware
        ), "TriXParser must be given a context aware store."

        reader = create_parser(sink.store)
        self._parser = reader
        sax_handler = reader.getContentHandler()

        keep_bnode_ids = args.get("preserve_bnode_ids")
        if keep_bnode_ids is not None:
            # type error: ContentHandler has no attribute "preserve_bnode_ids"
            sax_handler.preserve_bnode_ids = keep_bnode_ids  # type: ignore[attr-defined, unused-ignore]

        reader.parse(source)
@@ -0,0 +1,207 @@
"""
HextuplesSerializer RDF graph serializer for RDFLib.
See <https://github.com/ontola/hextuples> for details about the format.
"""
from __future__ import annotations
import json
import warnings
from typing import IO, Any, Callable, List, Optional, Type, Union, cast
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, ConjunctiveGraph, Dataset, Graph
from rdflib.namespace import RDF, XSD
from rdflib.serializer import Serializer
from rdflib.term import BNode, IdentifiedNode, Literal, URIRef
try:
import orjson
_HAS_ORJSON = True
except ImportError:
orjson = None # type: ignore[assignment, unused-ignore]
_HAS_ORJSON = False
__all__ = ["HextuplesSerializer"]
class HextuplesSerializer(Serializer):
    """
    Serializes RDF graphs to Hextuples format (one JSON array per line).
    """

    contexts: List[Union[Graph, IdentifiedNode]]
    dumps: Callable

    def __new__(cls, store: Union[Graph, Dataset, ConjunctiveGraph]):
        # The constant fields are stored on the class.  With orjson they are
        # pre-encoded ``Fragment`` objects; otherwise they are plain strings.
        if _HAS_ORJSON:
            cls.str_local_id: Union[str, Any] = orjson.Fragment(b'"localId"')
            cls.str_global_id: Union[str, Any] = orjson.Fragment(b'"globalId"')
            cls.empty: Union[str, Any] = orjson.Fragment(b'""')
            cls.lang_str: Union[str, Any] = orjson.Fragment(
                b'"' + RDF.langString.encode("utf-8") + b'"'
            )
            cls.xsd_string: Union[str, Any] = orjson.Fragment(
                b'"' + XSD.string.encode("utf-8") + b'"'
            )
        else:
            cls.str_local_id = "localId"
            cls.str_global_id = "globalId"
            cls.empty = ""
            cls.lang_str = f"{RDF.langString}"
            cls.xsd_string = f"{XSD.string}"
        # Zero-argument super() resolves relative to this class.  The former
        # ``super(cls, cls)`` re-entered this method forever for subclasses.
        return super().__new__(cls)

    def __init__(self, store: Union[Graph, Dataset, ConjunctiveGraph]):
        """Record which graphs to serialize and the store's graph type.

        For a ``Dataset``/``ConjunctiveGraph`` every context is collected and
        a truthy ``default_context`` is remembered and appended; for a plain
        ``Graph`` the graph itself is the only context.
        """
        self.default_context: Optional[Union[Graph, IdentifiedNode]]
        self.graph_type: Union[Type[Graph], Type[Dataset], Type[ConjunctiveGraph]]
        if isinstance(store, (Dataset, ConjunctiveGraph)):
            self.graph_type = (
                Dataset if isinstance(store, Dataset) else ConjunctiveGraph
            )
            self.contexts = list(store.contexts())
            if store.default_context:
                self.default_context = store.default_context
                self.contexts.append(store.default_context)
            else:
                self.default_context = None
        else:
            self.graph_type = Graph
            self.contexts = [store]
            self.default_context = None

        Serializer.__init__(self, store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        **kwargs: Any,
    ) -> None:
        """Write one Hextuple line per serializable triple to ``stream``.

        :param stream: binary stream receiving UTF-8 encoded lines.
        :param base: ignored; a warning is issued when it is given.
        :param encoding: anything other than ``None``/``"utf-8"`` only
            triggers a warning; output is UTF-8 regardless.
        :raises Exception: if the store is formula-aware.
        """
        if base is not None:
            warnings.warn(
                "base has no meaning for Hextuples serialization. "
                "I will ignore this value"
            )

        if encoding not in [None, "utf-8"]:
            warnings.warn(
                f"Hextuples files are always utf-8 encoded. "
                f"I was passed: {encoding}, "
                "but I'm still going to use utf-8 anyway!"
            )

        if self.store.formula_aware is True:
            raise Exception(
                "Hextuple serialization can't (yet) handle formula-aware stores"
            )

        context: Union[Graph, IdentifiedNode]
        context_str: Union[bytes, str]
        for context in self.contexts:
            # Generate context string just once, because it doesn't change
            # for every triple in this context
            context_str = cast(
                Union[str, bytes],
                (
                    self.empty
                    if self.graph_type is Graph
                    else (
                        orjson.Fragment('"' + self._context_str(context) + '"')
                        if _HAS_ORJSON
                        else self._context_str(context)
                    )
                ),
            )
            for triple in context:
                hl = self._hex_line(triple, context_str)
                if hl is not None:
                    # orjson already produces bytes; json produces str.
                    stream.write(hl if _HAS_ORJSON else hl.encode())

    def _hex_line(self, triple, context_str: Union[bytes, str]):
        """Return the encoded Hextuple line for ``triple``.

        Returns ``None`` when the subject is not a ``URIRef``/``BNode`` or the
        object is not a ``URIRef``, ``BNode`` or ``Literal`` (e.g. a
        QuotedGraph).  The result is ``bytes`` with orjson, ``str`` otherwise.
        """
        if not isinstance(triple[0], (URIRef, BNode)):
            # do not return anything for non-IRIs or BNs, e.g. QuotedGraph, Subjects
            return None

        obj = triple[2]
        if isinstance(obj, Literal):
            value = obj
            if obj.datatype is not None:
                datatype = f"{obj.datatype}"
            elif obj.language is not None:
                datatype = self.lang_str
            else:
                datatype = self.xsd_string
            language = f"{obj.language}" if obj.language is not None else self.empty
        elif isinstance(obj, URIRef):
            value = self._iri_or_bn(obj)
            datatype = self.str_global_id
            language = self.empty
        elif isinstance(obj, BNode):
            value = self._iri_or_bn(obj)
            datatype = self.str_local_id
            language = self.empty
        else:
            return None  # can't handle non URI, BN or Literal Object (QuotedGraph)

        line_list = [
            self._iri_or_bn(triple[0]),
            triple[1],
            value,
            datatype,
            language,
            context_str,
        ]
        outline: Union[str, bytes]
        if _HAS_ORJSON:
            outline = orjson.dumps(line_list, option=orjson.OPT_APPEND_NEWLINE)
        else:
            outline = json.dumps(line_list) + "\n"
        return outline

    def _iri_or_bn(self, i_):
        """Return the IRI text, the blank node's N3 form, or ``None``."""
        if isinstance(i_, URIRef):
            return f"{i_}"
        elif isinstance(i_, BNode):
            return f"{i_.n3()}"
        else:
            return None

    def _context_str(self, context: Union[Graph, IdentifiedNode]) -> str:
        """Return the graph-name field for ``context``.

        The empty string is returned for the dataset default graph id, for
        the recorded default context, and whenever a plain ``Graph`` is being
        serialized.
        """
        context_identifier: IdentifiedNode = (
            context.identifier if isinstance(context, Graph) else context
        )
        if context_identifier == DATASET_DEFAULT_GRAPH_ID:
            return ""
        if self.default_context is not None:
            if (
                isinstance(self.default_context, IdentifiedNode)
                and context_identifier == self.default_context
            ):
                return ""
            elif (
                isinstance(self.default_context, Graph)
                and context_identifier == self.default_context.identifier
            ):
                return ""
        if self.graph_type is Graph:
            # Only emit a context name when serializing a Dataset or ConjunctiveGraph
            return ""
        return (
            f"{context_identifier}"
            if isinstance(context_identifier, URIRef)
            else context_identifier.n3()
        )
@@ -0,0 +1,433 @@
"""
This serialiser will output an RDF Graph as a JSON-LD formatted document. See:
http://json-ld.org/
Example usage::
>>> from rdflib import Graph
>>> testrdf = '''
... @prefix dc: <http://purl.org/dc/terms/> .
... <http://example.org/about>
... dc:title "Someone's Homepage"@en .
... '''
>>> g = Graph().parse(data=testrdf, format='n3')
>>> print(g.serialize(format='json-ld', indent=2))
[
{
"@id": "http://example.org/about",
"http://purl.org/dc/terms/title": [
{
"@language": "en",
"@value": "Someone's Homepage"
}
]
}
]
"""
# From: https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/serializer.py
# NOTE: This code writes the entire JSON object into memory before serialising,
# but we should consider streaming the output to deal with arbitrarily large
# graphs.
from __future__ import annotations
import warnings
from typing import IO, Any, Dict, List, Optional
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, Graph, _ObjectType
from rdflib.namespace import RDF, XSD
from rdflib.serializer import Serializer
from rdflib.term import BNode, IdentifiedNode, Identifier, Literal, URIRef
from ..shared.jsonld.context import UNDEF, Context
from ..shared.jsonld.keys import CONTEXT, GRAPH, ID, LANG, LIST, SET, VOCAB
from ..shared.jsonld.util import _HAS_ORJSON, json, orjson
__all__ = ["JsonLDSerializer", "from_rdf"]
PLAIN_LITERAL_TYPES = {XSD.boolean, XSD.integer, XSD.double, XSD.string}
class JsonLDSerializer(Serializer):
    """Serializer that writes the graph as a JSON-LD document."""

    def __init__(self, store: Graph):
        super(JsonLDSerializer, self).__init__(store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Convert the store with ``from_rdf`` and write it to ``stream``.

        :param stream: binary stream receiving the encoded document.
        :param base: passed to ``from_rdf``.
        :param encoding: defaults to ``"utf-8"``; other values than
            ``"utf-8"``/``"utf-16"`` trigger a warning.  Only used on the
            ``json`` code path.
        :param kwargs: recognised keys are ``context``, ``use_native_types``,
            ``use_rdf_type``, ``auto_compact``, ``indent``, ``separators``,
            ``sort_keys`` and ``ensure_ascii``.
        """
        # TODO: docstring w. args and return value
        encoding = encoding or "utf-8"
        if encoding not in ("utf-8", "utf-16"):
            warnings.warn(
                "JSON should be encoded as unicode. " f"Given encoding was: {encoding}"
            )

        context_data = kwargs.get("context")
        # This used to be wrapped in a one-element tuple, which is always
        # truthy and made an explicit ``use_native_types=False`` ineffective.
        use_native_types = kwargs.get("use_native_types", False)
        use_rdf_type = kwargs.get("use_rdf_type", False)
        auto_compact = kwargs.get("auto_compact", False)

        indent = kwargs.get("indent", 2)
        separators = kwargs.get("separators", (",", ": "))
        sort_keys = kwargs.get("sort_keys", True)
        ensure_ascii = kwargs.get("ensure_ascii", False)

        obj = from_rdf(
            self.store,
            context_data,
            base,
            use_native_types,
            use_rdf_type,
            auto_compact=auto_compact,
        )
        if _HAS_ORJSON:
            # orjson only offers a fixed two-space indent and writes bytes
            # directly, so ``separators`` and ``encoding`` are not applied.
            option: int = orjson.OPT_NON_STR_KEYS
            if indent is not None:
                option |= orjson.OPT_INDENT_2
            if sort_keys:
                option |= orjson.OPT_SORT_KEYS
            if ensure_ascii:
                warnings.warn("Cannot use ensure_ascii with orjson")
            data_bytes = orjson.dumps(obj, option=option)
            stream.write(data_bytes)
        else:
            data = json.dumps(
                obj,
                indent=indent,
                separators=separators,
                sort_keys=sort_keys,
                ensure_ascii=ensure_ascii,
            )
            stream.write(data.encode(encoding, "replace"))
def from_rdf(
    graph,
    context_data=None,
    base=None,
    use_native_types=False,
    use_rdf_type=False,
    auto_compact=False,
    startnode=None,
    index=False,
):
    """Convert ``graph`` into a JSON-LD structure of dicts and lists.

    When no context is given and ``auto_compact`` is set, a context is built
    from the graph's prefixed namespaces (the XML namespace is left out).
    If the resulting context is active, the output is wrapped so that it
    carries the context under the ``CONTEXT`` key.

    ``startnode`` and ``index`` are accepted but not used here.
    """
    # TODO: docstring w. args and return value
    # TODO: support for index and startnode

    if not context_data and auto_compact:
        context_data = {
            pfx: str(ns)
            for (pfx, ns) in graph.namespaces()
            if pfx and str(ns) != "http://www.w3.org/XML/1998/namespace"
        }

    if isinstance(context_data, Context):
        context = context_data
        context_data = context.to_dict()
    else:
        context = Context(context_data, base=base)

    converter = Converter(context, use_native_types, use_rdf_type)
    result = converter.convert(graph)

    if not converter.context.active:
        return result

    if isinstance(result, list):
        # a bare list of nodes is wrapped so the context can sit next to it
        result = {context.get_key(GRAPH): result}
    result[CONTEXT] = context_data
    return result
class Converter:
    """Turns an RDF graph into JSON-LD node objects under a given context."""

    def __init__(self, context: Context, use_native_types: bool, use_rdf_type: bool):
        self.context = context
        # An active context enables native types regardless of the flag.
        self.use_native_types = context.active or use_native_types
        self.use_rdf_type = use_rdf_type

    def convert(self, graph: Graph):
        """Convert ``graph`` (possibly context-aware) to JSON-LD objects.

        Returns a list of objects, or a single object when the context is
        active and exactly one object was produced.
        """
        # TODO: bug in rdflib dataset parsing (nquads et al):
        # plain triples end up in separate unnamed graphs (rdflib issue #436)
        if graph.context_aware:
            # type error: "Graph" has no attribute "contexts"
            all_contexts = list(graph.contexts())  # type: ignore[attr-defined]
            has_dataset_default_id = any(
                c.identifier == DATASET_DEFAULT_GRAPH_ID for c in all_contexts
            )
            if (
                has_dataset_default_id
                # # type error: "Graph" has no attribute "contexts"
                and graph.default_context.identifier == DATASET_DEFAULT_GRAPH_ID  # type: ignore[attr-defined]
            ):
                default_graph = graph.default_context  # type: ignore[attr-defined]
            else:
                default_graph = Graph()
            graphs = [default_graph]
            default_graph_id = default_graph.identifier

            for g in all_contexts:
                if g in graphs:
                    continue
                if isinstance(g.identifier, URIRef):
                    graphs.append(g)
                else:
                    # graphs not named by an IRI are merged into the default graph
                    default_graph += g
        else:
            graphs = [graph]
            default_graph_id = graph.identifier

        context = self.context

        objs: List[Any] = []
        for g in graphs:
            obj = {}
            graphname = None

            if isinstance(g.identifier, URIRef):
                if g.identifier != default_graph_id:
                    graphname = context.shrink_iri(g.identifier)
                    obj[context.id_key] = graphname

            nodes = self.from_graph(g)

            if not graphname and len(nodes) == 1:
                obj.update(nodes[0])
            else:
                if not nodes:
                    continue

                obj[context.graph_key] = nodes

            if objs and objs[0].get(context.get_key(ID)) == graphname:
                objs[0].update(obj)
            else:
                objs.append(obj)

        if len(graphs) == 1 and len(objs) == 1 and not self.context.active:
            default = objs[0]
            items = default.get(context.graph_key)
            if len(default) == 1 and items:
                objs = items
        elif len(objs) == 1 and self.context.active:
            objs = objs[0]

        return objs

    def from_graph(self, graph: Graph):
        """Return the node objects for the top-level subjects of ``graph``."""
        nodemap: Dict[Any, Any] = {}

        for s in set(graph.subjects()):
            ## only iri:s and unreferenced (rest will be promoted to top if needed)
            if isinstance(s, URIRef) or (
                isinstance(s, BNode) and not any(graph.subjects(None, s))
            ):
                self.process_subject(graph, s, nodemap)

        return list(nodemap.values())

    def process_subject(self, graph: Graph, s: IdentifiedNode, nodemap):
        """Build the node object for ``s`` and register it in ``nodemap``.

        Returns ``None`` when ``s`` has already been registered, otherwise
        the newly built node dict.
        """
        if isinstance(s, URIRef):
            node_id = self.context.shrink_iri(s)
        elif isinstance(s, BNode):
            node_id = s.n3()
        else:
            # This does not seem right, this probably should be an error.
            node_id = None

        # used_as_object = any(graph.subjects(None, s))
        if node_id in nodemap:
            return None

        node = {}
        node[self.context.id_key] = node_id
        # Registered before descending so that re-entrant calls for the same
        # subject return early.
        nodemap[node_id] = node

        for p, o in graph.predicate_objects(s):
            # type error: Argument 3 to "add_to_node" of "Converter" has incompatible type "Node"; expected "IdentifiedNode"
            # type error: Argument 4 to "add_to_node" of "Converter" has incompatible type "Node"; expected "Identifier"
            self.add_to_node(graph, s, p, o, node, nodemap)  # type: ignore[arg-type]

        return node

    def add_to_node(
        self,
        graph: Graph,
        s: IdentifiedNode,
        p: IdentifiedNode,
        o: Identifier,
        s_node: Dict[str, Any],
        nodemap,
    ):
        """Add the value ``o`` of property ``p`` to the node dict ``s_node``."""
        context = self.context

        # Bound up front: it is read further down even when ``o`` is not a
        # Literal, which previously could raise UnboundLocalError.
        language = None

        if isinstance(o, Literal):
            datatype = str(o.datatype) if o.datatype else None
            language = o.language
            term = context.find_term(str(p), datatype, language=language)
        else:
            containers = [LIST, None] if graph.value(o, RDF.first) else [None]
            for container in containers:
                for coercion in (ID, VOCAB, UNDEF):
                    # type error: Argument 2 to "find_term" of "Context" has incompatible type "object"; expected "Union[str, Defined, None]"
                    # type error: Argument 3 to "find_term" of "Context" has incompatible type "Optional[str]"; expected "Union[Defined, str]"
                    term = context.find_term(str(p), coercion, container)  # type: ignore[arg-type]
                    if term:
                        break
                if term:
                    break

        node = None
        use_set = not context.active

        if term:
            p_key = term.name

            if term.type:
                node = self.type_coerce(o, term.type)
            # type error: "Identifier" has no attribute "language"
            elif term.language and o.language == term.language:  # type: ignore[attr-defined]
                node = str(o)
            # type error: Right operand of "and" is never evaluated
            elif context.language and (term.language is None and o.language is None):  # type: ignore[unreachable]
                node = str(o)  # type: ignore[unreachable]

            if LIST in term.container:
                node = [
                    self.type_coerce(v, term.type)
                    or self.to_raw_value(graph, s, v, nodemap)
                    for v in self.to_collection(graph, o)
                ]
            elif LANG in term.container and language:
                # language map: values are grouped under their language tag
                value = s_node.setdefault(p_key, {})
                values = value.get(language)
                node = str(o)
                if values or SET in term.container:
                    if not isinstance(values, list):
                        value[language] = values = [values]
                    values.append(node)
                else:
                    value[language] = node
                return
            elif SET in term.container:
                use_set = True

        else:
            p_key = context.to_symbol(p)
            # TODO: for coercing curies - quite clumsy; unify to_symbol and find_term?
            key_term = context.terms.get(p_key)
            if key_term and (key_term.type or key_term.container):
                p_key = p
            if not term and p == RDF.type and not self.use_rdf_type:
                if isinstance(o, URIRef):
                    node = context.to_symbol(o)
                p_key = context.type_key

        if node is None:
            node = self.to_raw_value(graph, s, o, nodemap)

        value = s_node.get(p_key)
        if value:
            if not isinstance(value, list):
                value = [value]
            value.append(node)
        elif use_set:
            value = [node]
        else:
            value = node
        s_node[p_key] = value

    def type_coerce(self, o: Identifier, coerce_type: str):
        """Return ``o`` coerced according to ``coerce_type``, or ``None``
        when the coercion does not apply."""
        if coerce_type == ID:
            if isinstance(o, URIRef):
                return self.context.shrink_iri(o)
            elif isinstance(o, BNode):
                return o.n3()
            else:
                return o
        elif coerce_type == VOCAB and isinstance(o, URIRef):
            return self.context.to_symbol(o)
        elif isinstance(o, Literal) and str(o.datatype) == coerce_type:
            return o
        else:
            return None

    def to_raw_value(
        self, graph: Graph, s: IdentifiedNode, o: Identifier, nodemap: Dict[str, Any]
    ):
        """Return the uncoerced JSON-LD representation of the object ``o``."""
        context = self.context
        coll = self.to_collection(graph, o)
        if coll is not None:
            # Reuse the collection just computed instead of walking the RDF
            # list a second time.
            coll = [self.to_raw_value(graph, s, lo, nodemap) for lo in coll]
            return {context.list_key: coll}
        elif isinstance(o, BNode):
            embed = (
                False  # TODO: self.context.active or using startnode and only one ref
            )
            onode = self.process_subject(graph, o, nodemap)
            if onode:
                if embed and not any(s2 for s2 in graph.subjects(None, o) if s2 != s):
                    return onode
                else:
                    nodemap[onode[context.id_key]] = onode
            return {context.id_key: o.n3()}
        elif isinstance(o, URIRef):
            # TODO: embed if o != startnode (else reverse)
            return {context.id_key: context.shrink_iri(o)}
        elif isinstance(o, Literal):
            # TODO: if compact
            native = self.use_native_types and o.datatype in PLAIN_LITERAL_TYPES
            if native:
                v = o.toPython()
            else:
                v = str(o)
            if o.datatype:
                if native and self.context.active:
                    return v
                return {
                    context.type_key: context.to_symbol(o.datatype),
                    context.value_key: v,
                }
            elif o.language and o.language != context.language:
                return {context.lang_key: o.language, context.value_key: v}
            # type error: Right operand of "and" is never evaluated
            elif not context.active or context.language and not o.language:  # type: ignore[unreachable]
                return {context.value_key: v}
            else:
                return v

    def to_collection(self, graph: Graph, l_: Identifier):
        """Return the members of the RDF list starting at ``l_``.

        Returns ``None`` when ``l_`` is not a well-formed list: it has no
        ``rdf:first``, a list node is an IRI other than ``rdf:nil``, a node
        carries properties other than first/rest/``rdf:type rdf:List``, the
        chain ends without reaching ``rdf:nil``, or the chain is cyclic.
        """
        if l_ != RDF.nil and not graph.value(l_, RDF.first):
            return None
        list_nodes: List[Optional[_ObjectType]] = []
        chain = {l_}  # nodes already visited, used for cycle detection
        while l_:
            if l_ == RDF.nil:
                return list_nodes
            if isinstance(l_, URIRef):
                return None
            first, rest = None, None
            for p, o in graph.predicate_objects(l_):
                if not first and p == RDF.first:
                    first = o
                elif not rest and p == RDF.rest:
                    rest = o
                elif p != RDF.type or o != RDF.List:
                    return None
            list_nodes.append(first)
            # type error: Incompatible types in assignment (expression has type "Optional[Node]", variable has type "Identifier")
            l_ = rest  # type: ignore[assignment]
            if l_ in chain:
                return None
            chain.add(l_)
@@ -0,0 +1,326 @@
"""
LongTurtle RDF graph serializer for RDFLib.
See <http://www.w3.org/TeamSubmission/turtle/> for syntax specification.
This variant, longturtle as opposed to just turtle, makes some small format changes
to turtle - the original turtle serializer. It:
* uses PREFIX instead of @prefix
* uses BASE instead of @base
* adds a new line at RDF.type, or 'a'
* adds a newline and an indent for all triples with more than one object (object list)
* adds a new line and ';' for the last triple in a set with '.'
on the start of the next line
* uses the default encoding (encode()) instead of "latin-1"
- Nicholas Car, 2023
"""
from __future__ import annotations
from typing import IO, Any, Optional
from rdflib.compare import to_canonical_graph
from rdflib.exceptions import Error
from rdflib.graph import Graph
from rdflib.namespace import RDF
from rdflib.term import BNode, Literal, URIRef
from .turtle import RecursiveSerializer
__all__ = ["LongTurtleSerializer"]
SUBJECT = 0
VERB = 1
OBJECT = 2
_GEN_QNAME_FOR_DT = False
_SPACIOUS_OUTPUT = False
class LongTurtleSerializer(RecursiveSerializer):
short_name = "longturtle"
indentString = " "
def __init__(self, store):
    """Canonicalise ``store`` and prepare the serializer state.

    The canonical graph is round-tripped through sorted N-Triples lines
    (with skolemization) and the rebuilt graph is what gets serialized.
    The caller's namespace manager is attached to the rebuilt graph so that
    its prefix bindings are not lost.
    """
    self._ns_rewrite = {}
    # Capture before ``store`` is rebound to the canonical graph below.
    namespace_manager = store.namespace_manager
    store = to_canonical_graph(store)
    content = store.serialize(format="application/n-triples")
    lines = content.split("\n")
    lines.sort()
    graph = Graph()
    graph.parse(
        data="\n".join(lines), format="application/n-triples", skolemize=True
    )
    graph = graph.de_skolemize()
    # A fresh Graph() does not carry the bindings of the original store.
    graph.namespace_manager = namespace_manager
    super(LongTurtleSerializer, self).__init__(graph)
    self.keywords = {RDF.type: "a"}
    self.reset()
    self.stream = None
    self._spacious: bool = _SPACIOUS_OUTPUT
def addNamespace(self, prefix, namespace):
    """Register ``prefix`` for ``namespace`` and return the prefix used.

    Turtle does not support prefixes that start with ``_``; such a prefix,
    or one already registered for a different namespace, is rewritten by
    prepending ``p`` until the result is unused.  Rewrites are remembered in
    ``self._ns_rewrite`` because namespaces are registered as triples are
    seen, so a rewritten name may collide with a "real" prefix later on.
    """
    needs_rewrite = (prefix > "" and prefix[0] == "_") or self.namespaces.get(
        prefix, namespace
    ) != namespace

    if needs_rewrite and prefix not in self._ns_rewrite:
        candidate = "p" + prefix
        while candidate in self.namespaces:
            candidate = "p" + candidate
        self._ns_rewrite[prefix] = candidate

    prefix = self._ns_rewrite.get(prefix, prefix)

    super(LongTurtleSerializer, self).addNamespace(prefix, namespace)
    return prefix
def reset(self):
    """Reset the inherited state, then clear this serializer's own caches."""
    super(LongTurtleSerializer, self).reset()
    # short-name cache, "document started" flag and prefix rewrite table
    self._shortNames, self._started, self._ns_rewrite = {}, False, {}
def serialize(
    self,
    stream: IO[bytes],
    base: Optional[str] = None,
    encoding: Optional[str] = None,
    spacious: Optional[bool] = None,
    **kwargs: Any,
) -> None:
    """Write the graph to ``stream``.

    :param stream: output stream, stored on ``self.stream``.
    :param base: base IRI; when ``None`` the store's own base is used if set.
    :param encoding: accepted but not read in this method.
    :param spacious: when not ``None``, overrides ``self._spacious``.
    """
    self.reset()
    self.stream = stream

    # if base is given here, use, if not and a base is set for the graph use that
    if base is not None:
        self.base = base
    elif self.store.base is not None:
        self.base = self.store.base

    if spacious is not None:
        self._spacious = spacious

    self.preprocess()
    ordered_subjects = self.orderSubjects()

    self.startDocument()

    for subject in ordered_subjects:
        if self.isDone(subject):
            continue
        # every statement that reports success is followed by a newline
        if self.statement(subject):
            self.write("\n")

    self.endDocument()

    self.base = None
def preprocessTriple(self, triple):
    """Run the inherited preprocessing, then register the namespaces needed
    by the triple's terms and literal datatypes."""
    super(LongTurtleSerializer, self).preprocessTriple(triple)

    for position, term in enumerate(triple):
        if term in self.keywords:
            continue
        # Don't use generated prefixes for subjects and objects
        self.getQName(term, gen_prefix=(position == VERB))
        if isinstance(term, Literal) and term.datatype:
            self.getQName(term.datatype, gen_prefix=_GEN_QNAME_FOR_DT)

    predicate = triple[1]
    if isinstance(predicate, BNode):  # hmm - when is P ever a bnode?
        self._references[predicate] += 1
def getQName(self, uri, gen_prefix=True):
    """Return ``prefix:local`` for ``uri``, or ``None`` if it has no QName.

    ``None`` is returned for anything that is not a ``URIRef``, when neither
    ``compute_qname`` nor a direct prefix lookup succeeds, and when the
    local part would end with a dot.  As a side effect the namespace is
    registered through ``addNamespace``.
    """
    if not isinstance(uri, URIRef):
        return None

    try:
        prefix, namespace, local = self.store.compute_qname(
            uri, generate=gen_prefix
        )
    except Exception:
        # is the uri a namespace in itself?
        own_prefix = self.store.store.prefix(uri)
        if own_prefix is None:
            # nothing worked
            return None
        prefix, namespace, local = own_prefix, uri, ""

    # QName cannot end with .
    if local.endswith("."):
        return None

    registered = self.addNamespace(prefix, namespace)
    return "%s:%s" % (registered, local)
def startDocument(self):
    """Write the ``BASE`` line (if a base is set) and the sorted ``PREFIX``
    lines; in spacious mode a blank line follows a non-empty prefix list."""
    self._started = True
    bindings = sorted(self.namespaces.items())

    if self.base:
        self.write(self.indent() + "BASE <%s>\n" % self.base)

    for pfx, ns_uri in bindings:
        self.write(self.indent() + "PREFIX %s: <%s>\n" % (pfx, ns_uri))

    if bindings and self._spacious:
        self.write("\n")
def endDocument(self):
    """Write a trailing newline when spacious output is enabled."""
    if not self._spacious:
        return
    self.write("\n")
def statement(self, subject):
    """Mark ``subject`` as done and write it, trying the ``[]`` form first
    and falling back to the default form."""
    self.subjectDone(subject)
    written = self.s_squared(subject)
    if not written:
        written = self.s_default(subject)
    return written
def s_default(self, subject):
    """Write ``subject`` on its own line, its predicate list on the next
    line, and a closing ``.`` on a line of its own.  Always returns True."""
    line_start = "\n" + self.indent()
    self.write(line_start)
    self.path(subject, SUBJECT)
    # indent() is evaluated again here, as in the two separate writes
    self.write("\n" + self.indent())
    self.predicateList(subject)
    self.write("\n.")
    return True
def s_squared(self, subject):
    """Write *subject* as an anonymous ``[]`` statement when possible.

    Only applies to a ``BNode`` whose reference count is not positive;
    otherwise nothing is written and ``False`` is returned.
    """
    if self._references[subject] > 0:
        return False
    if not isinstance(subject, BNode):
        return False
    self.write("\n" + self.indent() + "[]")
    self.predicateList(subject, newline=False)
    self.write("\n.")
    return True
def path(self, node, position, newline=False):
    """Write *node*, trying ``p_squared`` first and ``p_default`` second.

    Raises ``Error`` when neither writer reports success.
    """
    if self.p_squared(node, position):
        return
    if self.p_default(node, position, newline):
        return
    raise Error("Cannot serialize node '%s'" % (node,))
def p_default(self, node, position, newline=False):
    """Write the label of *node*; always returns ``True``.

    A single space is written first unless *node* is in the subject
    position or the caller has already started a new line.
    """
    needs_separator = position != SUBJECT and not newline
    if needs_separator:
        self.write(" ")
    text = self.label(node, position)
    self.write(text)
    return True
def label(self, node, position):
    """Return the text used to write *node* at *position*.

    ``RDF.nil`` becomes ``()``, a keyword in the verb position becomes its
    keyword text, literals are rendered through ``_literal_n3``, and any
    other node is relativized and written as a QName or, failing that,
    in its ``n3()`` form.
    """
    if node == RDF.nil:
        return "()"
    if position is VERB and node in self.keywords:
        return self.keywords[node]
    if not isinstance(node, Literal):
        node = self.relativize(node)
        return self.getQName(node, position == VERB) or node.n3()

    def datatype_qname(dt):
        return self.getQName(dt, _GEN_QNAME_FOR_DT)

    return node._literal_n3(use_plain=True, qname_callback=datatype_qname)
def p_squared(
    self,
    node,
    position,
):
    """Write *node* inline as ``( ... )`` or ``[ ... ]`` when it qualifies.

    Returns ``False`` without writing unless *node* is a ``BNode`` that has
    not been serialized, is referenced at most once, and is not in the
    subject position. Valid RDF lists are written with ``doList``; any
    other blank node is marked done and written as a bracketed
    predicate list.
    """
    if not isinstance(node, BNode):
        return False
    if node in self._serialized:
        return False
    if self._references[node] > 1:
        return False
    if position == SUBJECT:
        return False

    if self.isValidList(node):
        # this is a list
        self.depth += 2
        self.write(" (\n")
        self.depth -= 2
        self.doList(node)
        self.write("\n" + self.indent() + ")")
        return True

    # this is a Blank Node
    self.subjectDone(node)
    self.write("\n" + self.indent(1) + "[\n")
    self.depth += 1
    self.predicateList(node)
    self.depth -= 1
    self.write("\n" + self.indent(1) + "]")
    return True
def isValidList(self, l_):
    """
    Checks if l_ is a valid RDF list, i.e. no nodes have other properties.

    The head must have an ``rdf:first`` value (a failing lookup counts as
    invalid), and every node reached via ``rdf:rest`` other than
    ``rdf:nil`` must have exactly two predicate/object pairs.
    """
    try:
        head_item = self.store.value(l_, RDF.first)
    except Exception:
        return False
    if head_item is None:
        return False

    cell = l_
    while cell:
        if cell != RDF.nil:
            pair_count = len(list(self.store.predicate_objects(cell)))
            if pair_count != 2:
                return False
        cell = self.store.value(cell, RDF.rest)
    return True
def doList(self, l_):
    """Write the items of the RDF list starting at *l_*, one per line.

    Each cell that has an ``rdf:first`` value is written as an object and
    marked done. Every line except the one for the first cell visited is
    preceded by a newline.
    """
    # Empty for the first cell visited, a newline for every later one
    # (tracked per cell, not per written item).
    line_break = ""
    cell = l_
    while cell:
        item = self.store.value(cell, RDF.first)
        if item is not None:
            self.write(line_break + self.indent(1))
            self.path(item, OBJECT, newline=True)
            self.subjectDone(cell)
        cell = self.store.value(cell, RDF.rest)
        line_break = "\n"
def predicateList(self, subject, newline=False):
    """Write every predicate of *subject* followed by its objects.

    Predicates are written in the order given by ``sortProperties`` and
    separated by `` ;`` plus a newline; a final `` ;`` follows the last
    one. Nothing is written when the subject has no properties. The
    *newline* argument is accepted but not used here.
    """
    properties = self.buildPredicateHash(subject)
    ordered_predicates = self.sortProperties(properties)
    if not ordered_predicates:
        return

    for index, predicate in enumerate(ordered_predicates):
        separator = "" if index == 0 else " ;\n"
        self.write(separator + self.indent(1))
        self.verb(predicate, newline=True)
        self.objectList(properties[predicate])
    self.write(" ;")
def verb(self, node, newline=False):
    """Write *node* in the predicate (``VERB``) position via ``path``."""
    self.path(node, position=VERB, newline=newline)
def objectList(self, objects):
    """Write *objects* separated by `` ,``.

    When there is more than one object, each non-``BNode`` object starts
    on its own indented line. The depth is raised while the objects are
    written and restored afterwards.
    """
    count = len(objects)
    if count == 0:
        return

    # NOTE(review): the original expression `(count == 1) and 0 or 1`
    # evaluates to 1 for every count, because `True and 0` is falsy. The
    # value is kept at 1 so the emitted layout stays the same; confirm
    # whether 0 was intended for single-object lists.
    depthmod = 1
    self.depth += depthmod

    head, tail = objects[0], objects[1:]
    several = count > 1
    if several:
        if isinstance(head, BNode):
            self.write(" ")
        else:
            self.write("\n" + self.indent(1))
    self.path(head, OBJECT, newline=several)

    for obj in tail:
        self.write(" ,")
        if not isinstance(obj, BNode):
            self.write("\n" + self.indent(1))
        self.path(obj, OBJECT, newline=True)

    self.depth -= depthmod
@@ -0,0 +1,91 @@
"""
Notation 3 (N3) RDF graph serializer for RDFLib.
"""
from rdflib.graph import Graph
from rdflib.namespace import OWL, Namespace
from rdflib.plugins.serializers.turtle import OBJECT, SUBJECT, TurtleSerializer
__all__ = ["N3Serializer"]
# SWAP "log" vocabulary namespace; N3Serializer writes SWAP_LOG.implies as "=>".
SWAP_LOG = Namespace("http://www.w3.org/2000/10/swap/log#")
class N3Serializer(TurtleSerializer):
    """Turtle serializer extended with N3 keywords and ``{ ... }`` graph terms.

    A graph that appears as a term is written by a nested ``N3Serializer``
    whose ``parent`` is the serializer that encountered it.
    """

    short_name = "n3"

    def __init__(self, store: Graph, parent=None):
        super().__init__(store)
        self.keywords.update({OWL.sameAs: "=", SWAP_LOG.implies: "=>"})
        # Serializer that is writing the enclosing document, if any.
        self.parent = parent

    def reset(self):
        """Reset the inherited state and clear ``_stores``."""
        super().reset()
        self._stores = {}

    def endDocument(self):  # noqa: N802
        """End the document, but only when this is not a nested serializer."""
        if self.parent:
            return
        super().endDocument()

    def indent(self, modifier=0):
        """Return this serializer's indent followed by its parent's indent."""
        own_indent = super().indent(modifier)
        if self.parent is None:
            return own_indent
        return own_indent + self.parent.indent()  # modifier)

    def preprocessTriple(self, triple):  # noqa: N802
        """Preprocess *triple*, then every triple of any graph used as a term."""
        super().preprocessTriple(triple)
        for term in (triple[0], triple[1], triple[2]):
            if isinstance(term, Graph):
                for nested_triple in term:
                    self.preprocessTriple(nested_triple)

    def getQName(self, uri, gen_prefix=True):  # noqa: N802
        """Resolve a QName through the parent first, then through this serializer."""
        if self.parent is not None:
            inherited = self.parent.getQName(uri, gen_prefix)
            if inherited is not None:
                return inherited
        return super().getQName(uri, gen_prefix)

    def statement(self, subject):
        """Write the statement for *subject*; ``False`` when it has no properties."""
        self.subjectDone(subject)
        if not self.buildPredicateHash(subject):
            return False
        return self.s_clause(subject) or super().statement(subject)

    def path(self, node, position, newline=False):
        """Write *node* as a graph clause, or fall back to the inherited writer."""
        if self.p_clause(node, position):
            return
        super().path(node, position, newline)

    def s_clause(self, subject):
        """Write a statement whose subject is a graph; ``False`` for other subjects."""
        if not isinstance(subject, Graph):
            return False
        self.write("\n" + self.indent())
        self.p_clause(subject, SUBJECT)
        self.predicateList(subject)
        self.write(" .")
        return True

    def p_clause(self, node, position):
        """Write a graph term as ``{ ... }``; ``False`` when *node* is not a graph."""
        if not isinstance(node, Graph):
            return False
        self.subjectDone(node)
        if position is OBJECT:
            self.write(" ")
        self.write("{")
        self.depth += 1
        nested = N3Serializer(node, parent=self)
        # type error: Argument 1 to "serialize" of "TurtleSerializer" has incompatible type "Optional[IO[bytes]]"; expected "IO[bytes]"
        nested.serialize(self.stream)  # type: ignore[arg-type]
        self.depth -= 1
        self.write(self.indent() + "}")
        return True
@@ -0,0 +1,61 @@
from __future__ import annotations
import warnings
from typing import IO, Any, Optional
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.plugins.serializers.nt import _quoteLiteral
from rdflib.serializer import Serializer
from rdflib.term import Literal
__all__ = ["NQuadsSerializer"]
class NQuadsSerializer(Serializer):
    """Serializes every context of a context-aware store as N-Quads rows."""

    def __init__(self, store: Graph):
        if not store.context_aware:
            raise Exception(
                "NQuads serialization only makes sense for context-aware stores!"
            )

        super().__init__(store)
        self.store: ConjunctiveGraph

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Write one row per triple per context to *stream*, then a newline.

        *base* and a non-default *encoding* are not supported; each only
        triggers a warning, and the serializer's own encoding is used.
        """
        if base is not None:
            warnings.warn("NQuadsSerializer does not support base.")

        custom_encoding = (
            encoding is not None and encoding.lower() != self.encoding.lower()
        )
        if custom_encoding:
            warnings.warn(
                "NQuadsSerializer does not use custom encoding. "
                f"Given encoding was: {encoding}"
            )
        encoding = self.encoding

        for context in self.store.contexts():
            for triple in context:
                row = _nq_row(triple, context.identifier)
                stream.write(row.encode(encoding, "replace"))
        stream.write(b"\n")
def _nq_row(triple, context):
    """Format *triple* plus its *context* identifier as one N-Quads row.

    Literal objects are rendered with ``_quoteLiteral``; every other term
    uses its own ``n3()`` form.
    """
    object_is_literal = isinstance(triple[2], Literal)
    subject_text = triple[0].n3()
    predicate_text = triple[1].n3()
    if object_is_literal:
        object_text = _quoteLiteral(triple[2])
    else:
        object_text = triple[2].n3()
    context_text = context.n3()
    return "%s %s %s %s .\n" % (
        subject_text,
        predicate_text,
        object_text,
        context_text,
    )
@@ -0,0 +1,115 @@
from __future__ import annotations
import codecs
import warnings
from typing import IO, TYPE_CHECKING, Any, Optional, Tuple, Union
from rdflib.graph import Graph
from rdflib.serializer import Serializer
from rdflib.term import Literal
if TYPE_CHECKING:
from rdflib.graph import _TripleType
"""
N-Triples RDF graph serializer for RDFLib.
See <http://www.w3.org/TR/rdf-testcases/#ntriples> for details about the
format.
"""
__all__ = ["NTSerializer"]
class NTSerializer(Serializer):
    """
    Serializes RDF graphs to NTriples format.
    """

    def __init__(self, store: Graph):
        Serializer.__init__(self, store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = "utf-8" and None,
        encoding: Optional[str] = "utf-8",
        **kwargs: Any,
    ) -> None:
        """Write every triple of the store to *stream*, one row per triple.

        *base* and any *encoding* other than ``"utf-8"`` are not supported
        and only produce a warning; the output is always UTF-8.
        """
        if base is not None:
            warnings.warn("NTSerializer does not support base.")
        if encoding != "utf-8":
            warnings.warn(
                "NTSerializer always uses UTF-8 encoding. "
                f"Given encoding was: {encoding}"
            )

        for triple in self.store:
            row = _nt_row(triple)
            stream.write(row.encode("utf-8"))
class NT11Serializer(NTSerializer):
    """
    Serializes RDF graphs to RDF 1.1 NTriples format.

    Behaves exactly like the ``nt`` serializer, whose output is UTF-8
    encoded; only the constructor is redefined here.
    """

    def __init__(self, store: Graph):
        # Initialise the Serializer base directly; default to utf-8
        Serializer.__init__(self, store)
def _nt_row(triple: _TripleType) -> str:
    """Format *triple* as one N-Triples row terminated by `` .`` and a newline.

    Literal objects are rendered with ``_quoteLiteral``; every other term
    uses its own ``n3()`` form.
    """
    object_is_literal = isinstance(triple[2], Literal)
    subject_text = triple[0].n3()
    predicate_text = triple[1].n3()
    if object_is_literal:
        object_text = _quoteLiteral(triple[2])
    else:
        object_text = triple[2].n3()
    return "%s %s %s .\n" % (subject_text, predicate_text, object_text)
def _quoteLiteral(l_: Literal) -> str:  # noqa: N802
    """
    A simpler version of term.Literal.n3().

    Returns the quoted lexical form, followed by ``@language`` or
    ``^^<datatype>`` when the literal has one. Raises ``Exception`` when
    the literal has both a language and a datatype.
    """
    quoted = _quote_encode(l_)
    if l_.language:
        if l_.datatype:
            raise Exception("Literal has datatype AND language!")
        return "%s@%s" % (quoted, l_.language)
    if l_.datatype:
        return "%s^^<%s>" % (quoted, l_.datatype)
    return "%s" % quoted
def _quote_encode(l_: str) -> str:
return '"%s"' % l_.replace("\\", "\\\\").replace("\n", "\\n").replace(
'"', '\\"'
).replace("\r", "\\r")
def _nt_unicode_error_resolver(
err: UnicodeError,
) -> Tuple[Union[str, bytes], int]:
"""
Do unicode char replaces as defined in https://www.w3.org/TR/2004/REC-rdf-testcases-20040210/#ntrip_strings
"""
def _replace_single(c):
c = ord(c)
fmt = "\\u%04X" if c <= 0xFFFF else "\\U%08X"
return fmt % c
# type error: "UnicodeError" has no attribute "object"
# type error: "UnicodeError" has no attribute "start"
# type error: "UnicodeError" has no attribute "end"
string = err.object[err.start : err.end] # type: ignore[attr-defined]
# type error: "UnicodeError" has no attribute "end"
return "".join(_replace_single(c) for c in string), err.end # type: ignore[attr-defined]
# Register the resolver under the error-handler name "_rdflib_nt_escape" at
# import time, so it can be named as the ``errors`` argument when encoding.
codecs.register_error("_rdflib_nt_escape", _nt_unicode_error_resolver)
@@ -0,0 +1,108 @@
from __future__ import annotations
import warnings
from typing import IO, Any, Optional
from uuid import uuid4
from rdflib import Dataset
from rdflib.plugins.serializers.nquads import _nq_row
from rdflib.plugins.serializers.nt import _nt_row
from rdflib.serializer import Serializer
# Maps the ``operation`` keyword accepted by PatchSerializer.serialize to the
# op code written at the start of each patch row.
add_remove_methods = {"add": "A", "remove": "D"}
class PatchSerializer(Serializer):
    """
    Creates an RDF patch file to add and remove triples/quads.
    Can either:
    - Create an add or delete patch for a single Dataset.
    - Create a patch to represent the difference between two Datasets.
    """

    def __init__(
        self,
        store: Dataset,
    ):
        self.store: Dataset = store
        super().__init__(store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Serialize the store to the given stream.

        :param stream: The stream to serialize to.
        :param base: Not supported; a warning is issued when it is given.
        :param encoding: Not supported; a warning is issued when it differs
            from the serializer's own encoding, which is always used.
        :param kwargs: Additional keyword arguments.

        Supported keyword arguments:

        - operation: The operation to perform. Either 'add' or 'remove'.
        - target: The target Dataset to compare against.
          NB: Only one of 'operation' or 'target' should be provided.
          When neither is given, 'add' is used.
        - header_id: The header ID to use. A ``uuid:`` identifier is
          generated when it is missing.
        - header_prev: The previous header ID to use.

        :raises AssertionError: If 'operation' is given but is not a key of
            ``add_remove_methods``.
        """
        operation = kwargs.get("operation")
        target = kwargs.get("target")
        header_id = kwargs.get("header_id")
        header_prev = kwargs.get("header_prev")
        if not header_id:
            header_id = f"uuid:{uuid4()}"

        if base is not None:
            warnings.warn("PatchSerializer does not support base.")
        # Compare the caller's value BEFORE replacing it: the previous code
        # overwrote ``encoding`` first, so this warning could never fire.
        if encoding is not None and encoding.lower() != self.encoding.lower():
            warnings.warn(
                "PatchSerializer does not use custom encoding. "
                f"Given encoding was: {encoding}"
            )
        encoding = self.encoding

        def write_header():
            stream.write(f"H id <{header_id}> .\n".encode(encoding, "replace"))
            if header_prev:
                # NOTE(review): unlike the "H id" row, this row has no " ."
                # terminator - confirm against the RDF Patch format before
                # changing the emitted text.
                stream.write(f"H prev <{header_prev}>\n".encode(encoding, "replace"))
            stream.write("TX .\n".encode(encoding, "replace"))

        def write_triples(contexts, op_code, use_passed_contexts=False):
            for context in contexts:
                if not use_passed_contexts:
                    # Re-read the context from this serializer's own store.
                    context = self.store.get_context(context.identifier)
                for triple in context:
                    stream.write(
                        self._patch_row(triple, context.identifier, op_code).encode(
                            encoding, "replace"
                        )
                    )

        if operation:
            assert operation in add_remove_methods, f"Invalid operation: {operation}"
        elif not target:
            # No operation specified and no target specified
            # Fall back to default operation of "add" to prevent a no-op
            operation = "add"

        write_header()
        if operation:
            operation_code = add_remove_methods.get(operation)
            write_triples(self.store.contexts(), operation_code)
        elif target:
            to_add, to_remove = self._diff(target)
            write_triples(to_add.contexts(), "A", use_passed_contexts=True)
            write_triples(to_remove.contexts(), "D", use_passed_contexts=True)

        stream.write("TC .\n".encode(encoding, "replace"))

    def _diff(self, target):
        """Return ``(target - store, store - target)``: rows to add, rows to remove."""
        rows_to_add = target - self.store
        rows_to_remove = self.store - target
        return rows_to_add, rows_to_remove

    def _patch_row(self, triple, context_id, operation):
        """Format one patch row: a triple row for the default context, else a quad row."""
        if context_id == self.store.default_context.identifier:
            return f"{operation} {_nt_row(triple)}"
        else:
            return f"{operation} {_nq_row(triple, context_id)}"
@@ -0,0 +1,391 @@
from __future__ import annotations
import xml.dom.minidom
from typing import IO, Any, Dict, Generator, Optional, Set, Tuple
from xml.sax.saxutils import escape, quoteattr
from rdflib.collection import Collection
from rdflib.graph import Graph
from rdflib.namespace import RDF, RDFS, Namespace # , split_uri
from rdflib.plugins.parsers.RDFVOC import RDFVOC
from rdflib.plugins.serializers.xmlwriter import XMLWriter
from rdflib.serializer import Serializer
from rdflib.term import BNode, IdentifiedNode, Identifier, Literal, Node, URIRef
from rdflib.util import first, more_than
from .xmlwriter import ESCAPE_ENTITIES
__all__ = ["fix", "XMLSerializer", "PrettyXMLSerializer"]
class XMLSerializer(Serializer):
    """RDF/XML serializer that writes one ``rdf:Description`` per subject."""

    def __init__(self, store: Graph):
        super().__init__(store)

    def __bindings(self) -> Generator[Tuple[str, URIRef], None, None]:
        """Yield the ``(prefix, namespace)`` pairs needed by the predicates in use.

        The ``rdf`` prefix is always included and must be bound to the RDF
        syntax namespace.
        """
        graph = self.store
        manager = graph.namespace_manager
        found: Dict[str, URIRef] = {}

        for pred in set(graph.predicates()):
            # type error: Argument 1 to "compute_qname_strict" of "NamespaceManager" has incompatible type "Node"; expected "str"
            pfx, ns, _local = manager.compute_qname_strict(pred)  # type: ignore[arg-type]
            found[pfx] = URIRef(ns)

        rdf_ns = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        if "rdf" not in found:
            found["rdf"] = rdf_ns
        else:
            assert found["rdf"] == rdf_ns

        yield from found.items()

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Write the whole store to *stream* as RDF/XML.

        An explicit *base* wins over the store's base. The ``xml_base``
        keyword, when present, is written as ``xml:base`` instead of the
        base. The *encoding* argument is replaced by the serializer's own.
        """
        # if base is given here, use that, if not and a base is set for the graph use that
        if base is not None:
            self.base = base
        elif self.store.base is not None:
            self.base = self.store.base
        self.__stream = stream
        self.__serialized: Dict[Identifier, int] = {}
        encoding = self.encoding

        def write(uni):
            return stream.write(uni.encode(encoding, "replace"))

        self.write = write

        # startDocument
        write('<?xml version="1.0" encoding="%s"?>\n' % self.encoding)

        # startRDF
        write("<rdf:RDF\n")

        # If provided, write xml:base attribute for the RDF
        if "xml_base" in kwargs:
            write(' xml:base="%s"\n' % kwargs["xml_base"])
        elif self.base:
            write(' xml:base="%s"\n' % self.base)
        # TODO:
        # assert(
        #    namespaces["http://www.w3.org/1999/02/22-rdf-syntax-ns#"]=='rdf')
        for pfx, ns in sorted(self.__bindings()):
            if pfx:
                write(' xmlns:%s="%s"\n' % (pfx, ns))
            else:
                write(' xmlns="%s"\n' % ns)
        write(">\n")

        # write out triples by subject
        for subj in self.store.subjects():
            # type error: Argument 1 to "subject" of "XMLSerializer" has incompatible type "Node"; expected "Identifier"
            self.subject(subj, 1)  # type: ignore[arg-type]

        # endRDF
        write("</rdf:RDF>\n")

        # Drop the bookkeeping dict so that the memory can get garbage collected.
        del self.__serialized

    def subject(self, subject: Identifier, depth: int = 1) -> None:
        """Write the ``rdf:Description`` element for *subject* once.

        Subjects already written, and subjects that are neither ``BNode``
        nor ``URIRef``, produce no output.
        """
        if subject in self.__serialized:
            return
        self.__serialized[subject] = 1

        if not isinstance(subject, (BNode, URIRef)):
            return

        emit = self.write
        pad = " " * depth
        tag = "rdf:Description"

        if isinstance(subject, BNode):
            emit('%s<%s rdf:nodeID="%s"' % (pad, tag, subject))
        else:
            about = quoteattr(self.relativize(subject))
            emit("%s<%s rdf:about=%s" % (pad, tag, about))

        if (subject, None, None) not in self.store:
            emit("/>\n")
            return

        emit(">\n")
        for pred, obj in self.store.predicate_objects(subject):
            # type error: Argument 1 to "predicate" of "XMLSerializer" has incompatible type "Node"; expected "Identifier"
            # type error: Argument 2 to "predicate" of "XMLSerializer" has incompatible type "Node"; expected "Identifier"
            self.predicate(pred, obj, depth + 1)  # type: ignore[arg-type]
        emit("%s</%s>\n" % (pad, tag))

    def predicate(
        self, predicate: Identifier, object: Identifier, depth: int = 1
    ) -> None:
        """Write one property element for the *predicate* / *object* pair.

        Literals become element text with optional ``xml:lang`` and
        ``rdf:datatype`` attributes; blank nodes are referenced through
        ``rdf:nodeID`` and everything else through ``rdf:resource``.
        """
        emit = self.write
        pad = " " * depth
        qname = self.store.namespace_manager.qname_strict(predicate)

        if isinstance(object, Literal):
            attribute_parts = []
            if object.language:
                attribute_parts.append(' xml:lang="%s"' % object.language)
            if object.datatype:
                attribute_parts.append(' rdf:datatype="%s"' % object.datatype)
            attributes = "".join(attribute_parts)
            emit(
                "%s<%s%s>%s</%s>\n"
                % (pad, qname, attributes, escape(object, ESCAPE_ENTITIES), qname)
            )
        elif isinstance(object, BNode):
            emit('%s<%s rdf:nodeID="%s"/>\n' % (pad, qname, object))
        else:
            emit(
                "%s<%s rdf:resource=%s/>\n"
                % (pad, qname, quoteattr(self.relativize(object)))
            )
# Attribute names passed to XMLWriter.attribute for the language tag and the
# base URI. NOTE(review): each looks like the XML namespace URI with the local
# name ("lang" / "base") appended directly - confirm how XMLWriter splits them.
XMLLANG = "http://www.w3.org/XML/1998/namespacelang"
XMLBASE = "http://www.w3.org/XML/1998/namespacebase"
# OWL namespace; OWL_NS.Class is consulted when deciding how to write an object.
OWL_NS = Namespace("http://www.w3.org/2002/07/owl#")
# TODO:
def fix(val: str) -> str:
    """Strip a leading ``_:`` from a nodeID, as it is not valid in an NCName.

    Values without that prefix are returned unchanged.
    """
    return val[2:] if val.startswith("_:") else val
class PrettyXMLSerializer(Serializer):
    """RDF/XML serializer that nests resources inside the elements that use them.

    Typed subjects are written as typed node elements, RDF lists as
    ``rdf:parseType="Collection"``, and objects are inlined up to
    ``max_depth`` levels deep.
    """

    def __init__(self, store: Graph, max_depth=3):
        # ``max_depth`` is accepted here for compatibility only; the value
        # actually used is read from the keyword arguments of serialize().
        super(PrettyXMLSerializer, self).__init__(store)
        # URIRefs that must be written as a bare rdf:Description/rdf:about
        # element (set while writing collection items).
        self.forceRDFAbout: Set[URIRef] = set()

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Write the whole store to *stream* as nested RDF/XML.

        :param stream: Binary stream to write to.
        :param base: Base URI; falls back to the store's base when omitted.
        :param encoding: Passed on to the ``XMLWriter``.
        :param kwargs: ``max_depth`` (default 3, must be greater than 0)
            limits nesting; ``xml_base`` overrides the written ``xml:base``.
        :raises AssertionError: If ``max_depth`` is not greater than 0.
        """
        self.__serialized: Dict[Identifier, int] = {}
        store = self.store
        # if base is given here, use that, if not and a base is set for the graph use that
        if base is not None:
            self.base = base
        elif store.base is not None:
            self.base = store.base
        self.max_depth = kwargs.get("max_depth", 3)
        assert self.max_depth > 0, "max_depth must be greater than 0"

        self.nm = nm = store.namespace_manager
        self.writer = writer = XMLWriter(stream, nm, encoding)
        namespaces = {}

        # Every predicate and every rdf:type object ends up as an element
        # name, so each needs a namespace binding.
        possible: Set[Node] = set(store.predicates()).union(
            store.objects(None, RDF.type)
        )

        for predicate in possible:
            # type error: Argument 1 to "compute_qname_strict" of "NamespaceManager" has incompatible type "Node"; expected "str"
            prefix, namespace, local = nm.compute_qname_strict(predicate)  # type: ignore[arg-type]
            namespaces[prefix] = namespace

        namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

        writer.push(RDFVOC.RDF)

        if "xml_base" in kwargs:
            writer.attribute(XMLBASE, kwargs["xml_base"])
        elif self.base:
            writer.attribute(XMLBASE, self.base)

        writer.namespaces(namespaces.items())

        subject: IdentifiedNode
        # Write out subjects that can not be inline
        # type error: Incompatible types in assignment (expression has type "Node", variable has type "IdentifiedNode")
        for subject in store.subjects():  # type: ignore[assignment]
            if (None, None, subject) in store:
                if (subject, None, subject) in store:
                    self.subject(subject, 1)
            else:
                self.subject(subject, 1)

        # write out anything that has not yet been reached
        # write out BNodes last (to ensure they can be inlined where possible)
        bnodes = set()

        # type error: Incompatible types in assignment (expression has type "Node", variable has type "IdentifiedNode")
        for subject in store.subjects():  # type: ignore[assignment]
            if isinstance(subject, BNode):
                bnodes.add(subject)
                continue
            self.subject(subject, 1)

        # now serialize only those BNodes that have not been serialized yet
        for bnode in bnodes:
            if bnode not in self.__serialized:
                # Fixed: this used to pass the stale ``subject`` variable left
                # over from the loop above, so the blank node was never written.
                self.subject(bnode, 1)

        writer.pop(RDFVOC.RDF)
        stream.write("\n".encode("latin-1"))

        # Set to None so that the memory can get garbage collected.
        self.__serialized = None  # type: ignore[assignment]

    def subject(self, subject: Identifier, depth: int = 1):
        """Write the node element for *subject*, including its properties.

        A subject listed in ``forceRDFAbout`` is written as an empty
        ``rdf:Description`` with ``rdf:about`` and removed from that set.
        Otherwise the subject is written once; later calls do nothing.
        """
        store = self.store
        writer = self.writer

        if subject in self.forceRDFAbout:
            writer.push(RDFVOC.Description)
            writer.attribute(RDFVOC.about, self.relativize(subject))
            writer.pop(RDFVOC.Description)
            self.forceRDFAbout.remove(subject)  # type: ignore[arg-type]

        elif subject not in self.__serialized:
            self.__serialized[subject] = 1
            rdf_type = first(store.objects(subject, RDF.type))

            try:
                # Only use the type as the element name if it has a QName.
                # type error: Argument 1 to "qname" of "NamespaceManager" has incompatible type "Optional[Node]"; expected "str"
                self.nm.qname(rdf_type)  # type: ignore[arg-type]
            except Exception:
                rdf_type = None

            element = rdf_type or RDFVOC.Description
            # type error: Argument 1 to "push" of "XMLWriter" has incompatible type "Node"; expected "str"
            writer.push(element)  # type: ignore[arg-type]

            if isinstance(subject, BNode):
                # The BNode label is always written. The check that limited
                # it to nodes referenced more than once was disabled:
                # more_than(store.triples((None, None, subject)), ceil)
                writer.attribute(RDFVOC.nodeID, fix(subject))
            else:
                writer.attribute(RDFVOC.about, self.relativize(subject))

            if (subject, None, None) in store:
                for predicate, object in store.predicate_objects(subject):
                    # The type used as element name is not repeated as a property.
                    if not (predicate == RDF.type and object == rdf_type):
                        # type error: Argument 1 to "predicate" of "PrettyXMLSerializer" has incompatible type "Node"; expected "Identifier"
                        # type error: Argument 2 to "predicate" of "PrettyXMLSerializer" has incompatible type "Node"; expected "Identifier"
                        self.predicate(predicate, object, depth + 1)  # type: ignore[arg-type]

            # type error: Argument 1 to "pop" of "XMLWriter" has incompatible type "Node"; expected "Optional[str]"
            writer.pop(element)  # type: ignore[arg-type]

        # A third branch repeating the forceRDFAbout test used to follow
        # here; it could never run because the first branch catches it.

    def predicate(
        self, predicate: Identifier, object: Identifier, depth: int = 1
    ) -> None:
        """Write the property element for the *predicate* / *object* pair.

        Literals are written as text, already-written or property-less
        resources as references, RDF lists as collections, and other
        resources are inlined while *depth* allows it.
        """
        writer = self.writer
        store = self.store
        writer.push(predicate)

        if isinstance(object, Literal):
            if object.language:
                writer.attribute(XMLLANG, object.language)

            if object.datatype == RDF.XMLLiteral and isinstance(
                object.value, xml.dom.minidom.Document
            ):
                writer.attribute(RDFVOC.parseType, "Literal")
                writer.text("")
                writer.stream.write(object)
            else:
                if object.datatype:
                    writer.attribute(RDFVOC.datatype, object.datatype)
                writer.text(object)

        elif (
            object in self.__serialized
            or not (object, None, None) in store  # noqa: E713
        ):
            # Nothing (more) to inline: reference the node instead.
            if isinstance(object, BNode):
                if more_than(store.triples((None, None, object)), 0):
                    writer.attribute(RDFVOC.nodeID, fix(object))
            else:
                writer.attribute(RDFVOC.resource, self.relativize(object))

        else:
            if first(store.objects(object, RDF.first)):  # may not have type
                # RDF.List

                self.__serialized[object] = 1

                # Warn that any assertions on object other than
                # RDF.first and RDF.rest are ignored... including RDF.List
                import warnings

                warnings.warn(
                    "Assertions on %s other than RDF.first " % repr(object)
                    + "and RDF.rest are ignored ... including RDF.List",
                    UserWarning,
                    stacklevel=2,
                )
                writer.attribute(RDFVOC.parseType, "Collection")

                col = Collection(store, object)

                for item in col:
                    if isinstance(item, URIRef):
                        self.forceRDFAbout.add(item)
                    # type error: Argument 1 to "subject" of "PrettyXMLSerializer" has incompatible type "Node"; expected "Identifier"
                    self.subject(item)  # type: ignore[arg-type]

                    if not isinstance(item, URIRef):
                        # type error: Invalid index type "Node" for "Dict[Identifier, int]"; expected type "Identifier"
                        self.__serialized[item] = 1  # type: ignore[index]
            else:
                if first(
                    store.triples_choices(
                        # type error: Argument 1 to "triples_choices" of "Graph" has incompatible type "Tuple[Identifier, URIRef, List[URIRef]]"; expected "Union[Tuple[List[Node], Node, Node], Tuple[Node, List[Node], Node], Tuple[Node, Node, List[Node]]]"
                        (object, RDF.type, [OWL_NS.Class, RDFS.Class])  # type: ignore[arg-type]
                    )
                ) and isinstance(object, URIRef):
                    writer.attribute(RDFVOC.resource, self.relativize(object))

                elif depth <= self.max_depth:
                    self.subject(object, depth + 1)

                elif isinstance(object, BNode):
                    if (
                        object not in self.__serialized
                        and (object, None, None) in store
                        and len(list(store.subjects(object=object))) == 1
                    ):
                        # inline blank nodes if they haven't been serialized yet
                        # and are only referenced once (regardless of depth)
                        self.subject(object, depth + 1)
                    else:
                        writer.attribute(RDFVOC.nodeID, fix(object))

                else:
                    writer.attribute(RDFVOC.resource, self.relativize(object))

        writer.pop(predicate)
@@ -0,0 +1,121 @@
"""
Trig RDF graph serializer for RDFLib.
See <http://www.w3.org/TR/trig/> for syntax specification.
"""
from __future__ import annotations
from typing import IO, TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.plugins.serializers.turtle import TurtleSerializer
from rdflib.term import BNode, Node
if TYPE_CHECKING:
from rdflib.graph import _ContextType, _SubjectType
__all__ = ["TrigSerializer"]
class TrigSerializer(TurtleSerializer):
    """Turtle serializer that writes each context of the store as a TriG graph block."""

    short_name = "trig"
    indentString = 4 * " "

    def __init__(self, store: Union[Graph, ConjunctiveGraph]):
        self.default_context: Optional[Node]
        if not store.context_aware:
            # A plain graph is treated as its own single context.
            self.contexts = [store]
            self.default_context = None
        else:
            if TYPE_CHECKING:
                assert isinstance(store, ConjunctiveGraph)
            self.contexts = list(store.contexts())
            self.default_context = store.default_context.identifier
            if store.default_context:
                self.contexts.append(store.default_context)

        super().__init__(store)

    def preprocess(self) -> None:
        """Preprocess every non-empty context and remember its subject order."""
        for context in self.contexts:
            # do not write unnecessary prefix (ex: for an empty default graph)
            if len(context) == 0:
                continue
            self.store = context
            # Don't generate a new prefix for a graph URI if one already exists
            self.getQName(context.identifier, False)
            self._subjects = {}

            for triple in context:
                self.preprocessTriple(triple)

            for subject in self._subjects:
                self._references[subject] += 1

            ordered = self.orderSubjects()
            self._contexts[context] = (ordered, self._subjects)

    def reset(self) -> None:
        """Reset the inherited state and forget all preprocessed contexts."""
        super().reset()
        self._contexts: Dict[
            _ContextType,
            Tuple[List[_SubjectType], Dict[_SubjectType, bool]],
        ] = {}

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        spacious: Optional[bool] = None,
        **kwargs: Any,
    ) -> None:
        """Write every preprocessed context to *stream* as a ``{ ... }`` block.

        The default context is written without a label; other contexts are
        labelled with an existing QName or, failing that, the ``n3()`` form
        of their identifier.
        """
        self.reset()
        self.stream = stream
        # if base is given here, use that, if not and a base is set for the graph use that
        if base is not None:
            self.base = base
        elif self.store.base is not None:
            self.base = self.store.base

        if spacious is not None:
            self._spacious = spacious

        self.preprocess()

        self.startDocument()

        for graph, (ordered_subjects, subject_map) in self._contexts.items():
            if not ordered_subjects:
                continue

            self._serialized = {}
            self.store = graph
            self._subjects = subject_map

            if self.default_context and graph.identifier == self.default_context:
                opening = "\n{"
            else:
                if isinstance(graph.identifier, BNode):
                    graph_label = graph.identifier.n3()
                else:
                    # Show the full graph URI if a prefix for it doesn't already exist
                    graph_label = self.getQName(graph.identifier, False)
                    if graph_label is None:
                        graph_label = graph.identifier.n3()
                opening = "\n%s {" % graph_label
            self.write(self.indent() + opening)

            self.depth += 1
            for subject in ordered_subjects:
                if self.isDone(subject):
                    continue
                # The original kept a first-time flag here, but it was always
                # cleared before being tested, so the newline was never skipped.
                if self.statement(subject):
                    self.write("\n")
            self.depth -= 1
            self.write("}\n")

        self.endDocument()
        stream.write(b"\n")
@@ -0,0 +1,91 @@
from __future__ import annotations
from typing import IO, Any, Optional
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.namespace import Namespace
from rdflib.plugins.serializers.xmlwriter import XMLWriter
from rdflib.serializer import Serializer
from rdflib.term import BNode, Literal, URIRef
__all__ = ["TriXSerializer"]
# TODO: Move this somewhere central
# Namespace of the TriX element names written by TriXSerializer.
TRIXNS = Namespace("http://www.w3.org/2004/03/trix/trix-1/")
# XML namespace; XMLNS["lang"] is the attribute used for language-tagged literals.
XMLNS = Namespace("http://www.w3.org/XML/1998/namespace")
class TriXSerializer(Serializer):
    """Serializes a context-aware store as TriX, one ``graph`` element per context."""

    def __init__(self, store: Graph):
        super().__init__(store)
        if not store.context_aware:
            raise Exception(
                "TriX serialization only makes sense for context-aware stores"
            )

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Write the store to *stream* as a TriX document followed by a newline.

        An explicit *base* wins over the store's base. Raises ``Exception``
        when the store is neither a ``ConjunctiveGraph`` nor a ``Graph``.
        """
        manager = self.store.namespace_manager

        self.writer = XMLWriter(stream, manager, encoding, extra_ns={"": TRIXNS})

        self.writer.push(TRIXNS["TriX"])
        # if base is given here, use that, if not and a base is set for the graph use that
        effective_base = base
        if effective_base is None and self.store.base is not None:
            effective_base = self.store.base
        if effective_base is not None:
            self.writer.attribute(
                "http://www.w3.org/XML/1998/namespacebase", effective_base
            )
        self.writer.namespaces()

        if isinstance(self.store, ConjunctiveGraph):
            for subgraph in self.store.contexts():
                self._writeGraph(subgraph)
        elif isinstance(self.store, Graph):
            self._writeGraph(self.store)
        else:
            raise Exception(f"Unknown graph type: {type(self.store)}")

        self.writer.pop()
        stream.write(b"\n")

    def _writeGraph(self, graph):  # noqa: N802
        """Write one ``graph`` element: optional base and name, then its triples."""
        out = self.writer
        out.push(TRIXNS["graph"])
        if graph.base:
            out.attribute("http://www.w3.org/XML/1998/namespacebase", graph.base)
        # Only graphs named by a URIRef get a name element.
        if isinstance(graph.identifier, URIRef):
            out.element(TRIXNS["uri"], content=str(graph.identifier))

        for triple in graph.triples((None, None, None)):
            self._writeTriple(triple)
        out.pop()

    def _writeTriple(self, triple):  # noqa: N802
        """Write one ``triple`` element with a child element per term.

        Terms that are not ``URIRef``, ``BNode`` or ``Literal`` are skipped.
        """
        out = self.writer
        out.push(TRIXNS["triple"])
        for component in triple:
            text = str(component)
            if isinstance(component, URIRef):
                out.element(TRIXNS["uri"], content=text)
            elif isinstance(component, BNode):
                out.element(TRIXNS["id"], content=text)
            elif isinstance(component, Literal):
                if component.datatype:
                    out.element(
                        TRIXNS["typedLiteral"],
                        content=text,
                        attributes={TRIXNS["datatype"]: str(component.datatype)},
                    )
                elif component.language:
                    out.element(
                        TRIXNS["plainLiteral"],
                        content=text,
                        attributes={XMLNS["lang"]: str(component.language)},
                    )
                else:
                    out.element(TRIXNS["plainLiteral"], content=text)
        out.pop()
@@ -0,0 +1,453 @@
"""
Turtle RDF graph serializer for RDFLib.
See <http://www.w3.org/TeamSubmission/turtle/> for syntax specification.
"""
from __future__ import annotations
from collections import defaultdict
from typing import (
IO,
TYPE_CHECKING,
Any,
DefaultDict,
Dict,
List,
Mapping,
Optional,
Sequence,
Tuple,
)
from rdflib.exceptions import Error
from rdflib.graph import Graph
from rdflib.namespace import RDF, RDFS
from rdflib.serializer import Serializer
from rdflib.term import BNode, Literal, Node, URIRef
if TYPE_CHECKING:
from rdflib.graph import _PredicateType, _SubjectType, _TripleType
__all__ = ["RecursiveSerializer", "TurtleSerializer"]
class RecursiveSerializer(Serializer):
topClasses = [RDFS.Class]
predicateOrder = [RDF.type, RDFS.label]
maxDepth = 10
indentString = " "
roundtrip_prefixes: Tuple[Any, ...] = ()
def __init__(self, store: Graph):
    """Initialise the base serializer for *store*, then reset all state.

    The output stream is unset until a subclass assigns one.
    """
    super(RecursiveSerializer, self).__init__(store)
    self.stream: Optional[IO[bytes]] = None
    self.reset()
def addNamespace(self, prefix: str, uri: URIRef) -> None:
    """Bind *prefix* to *uri* in ``self.namespaces``.

    Rebinding a prefix to the same URI is allowed; binding it to a
    different URI raises ``Exception``.
    """
    try:
        bound_uri = self.namespaces[prefix]
    except KeyError:
        pass
    else:
        if bound_uri != uri:
            raise Exception(
                "Trying to override namespace prefix %s => %s, but it's already bound to %s"
                % (prefix, uri, bound_uri)
            )
    self.namespaces[prefix] = uri
def checkSubject(self, subject: _SubjectType) -> bool:
    """Check to see if the subject should be serialized yet.

    Returns ``False`` for a subject that is already done, is not a known
    subject, is a top-level subject reached below depth 1, or is a
    ``URIRef`` at or beyond ``maxDepth``.
    """
    if self.isDone(subject):
        return False
    if subject not in self._subjects:
        return False
    if subject in self._topLevels and self.depth > 1:
        return False
    if isinstance(subject, URIRef) and self.depth >= self.maxDepth:
        return False
    return True
def isDone(self, subject: _SubjectType) -> bool:
    """Return true if subject is serialized (i.e. was passed to ``subjectDone``)."""
    return subject in self._serialized
def orderSubjects(self) -> List[_SubjectType]:
    """Return every subject in the order it should be written.

    Sorted instances of each class in ``topClasses`` come first (and are
    flagged as top-level). The remaining subjects follow, sorted so that
    non-blank nodes precede blank nodes and, within each group, less
    referenced nodes come first.
    """
    ordered: List[_SubjectType] = []
    placed = set()

    for class_uri in self.topClasses:
        instances = sorted(self.store.subjects(RDF.type, class_uri))
        ordered.extend(instances)
        for instance in instances:
            self._topLevels[instance] = True
            placed.add(instance)

    # Sort key: (is blank node, reference count, the node itself).
    remaining = sorted(
        (isinstance(candidate, BNode), self._references[candidate], candidate)
        for candidate in self._subjects
        if candidate not in placed
    )
    ordered.extend(entry[2] for entry in remaining)
    return ordered
def preprocess(self) -> None:
    """Feed every triple of the store to ``preprocessTriple``."""
    for triple in self.store.triples((None, None, None)):
        self.preprocessTriple(triple)
def preprocessTriple(self, spo: _TripleType) -> None:
    """Record one triple: count the object as referenced, note the subject."""
    s, p, o = spo
    self._references[o] += 1
    self._subjects[s] = True
def reset(self) -> None:
    """Clear all per-serialization state, then register round-trip prefixes."""
    self.depth = 0
    # Typed none because nothing is using it ...
    self.lists: Dict[None, None] = {}
    self.namespaces: Dict[str, URIRef] = {}
    # How often each node occurs in object position (see preprocessTriple).
    self._references: DefaultDict[Node, int] = defaultdict(int)
    self._serialized: Dict[_SubjectType, bool] = {}
    self._subjects: Dict[_SubjectType, bool] = {}
    self._topLevels: Dict[_SubjectType, bool] = {}

    if self.roundtrip_prefixes:
        if hasattr(self.roundtrip_prefixes, "__iter__"):
            # An iterable selects which of the store's prefixes to register.
            for prefix, ns in self.store.namespaces():
                if prefix in self.roundtrip_prefixes:
                    self.addNamespace(prefix, ns)
        else:
            # Any other truthy value registers every prefix bound in the store.
            for prefix, ns in self.store.namespaces():
                self.addNamespace(prefix, ns)
def buildPredicateHash(
    self, subject: _SubjectType
) -> Mapping[_PredicateType, List[Node]]:
    """Group the objects of ``subject`` by predicate.

    Returns a mapping from each predicate used with ``subject`` to the
    list of its objects, in the order the store yields the triples.
    """
    grouped: Dict[_PredicateType, List[Node]] = {}
    for _, predicate, obj in self.store.triples((subject, None, None)):
        grouped.setdefault(predicate, []).append(obj)
    return grouped
def sortProperties(
    self, properties: Mapping[_PredicateType, List[Node]]
) -> List[_PredicateType]:
    """Sort the value lists in place and return the predicates in output order.

    Predicates named in ``predicateOrder`` come first (in that order, when
    present in ``properties``); all others follow in sorted order.
    """
    for values in properties.values():
        values.sort()

    ordered: List[_PredicateType] = []
    emitted = set()

    for preferred in self.predicateOrder:
        if preferred in properties and preferred not in emitted:
            ordered.append(preferred)
            emitted.add(preferred)

    ordered.extend(
        predicate for predicate in sorted(properties) if predicate not in emitted
    )
    return ordered
def subjectDone(self, subject: _SubjectType) -> None:
    """Mark a subject as done, so that ``isDone`` reports it as serialized."""
    self._serialized[subject] = True
def indent(self, modifier: int = 0) -> str:
    """Return the indent string repeated ``depth + modifier`` times."""
    levels = self.depth + modifier
    return levels * self.indentString
def write(self, text: str) -> None:
    """Write text in given encoding.

    Characters that ``self.encoding`` cannot represent are substituted
    (``"replace"`` error handler) rather than raising.
    """
    # type error: Item "None" of "Optional[IO[bytes]]" has no attribute "write"
    self.stream.write(text.encode(self.encoding, "replace"))  # type: ignore[union-attr]
# Position of a node within a triple, as passed to ``path`` / ``label``.
SUBJECT = 0
VERB = 1
OBJECT = 2
# Passed as ``gen_prefix`` when computing a qname for a literal's datatype.
_GEN_QNAME_FOR_DT = False
# Default for TurtleSerializer's ``_spacious`` flag (extra blank lines in output).
_SPACIOUS_OUTPUT = False
class TurtleSerializer(RecursiveSerializer):
    """Serializer that writes the graph using Turtle syntax."""

    # presumably the name this serializer is registered under — confirm
    short_name = "turtle"
    # Overrides the indentation unit of RecursiveSerializer.
    indentString = " "
def __init__(self, store: Graph):
    """Create a Turtle serializer for ``store``."""
    # Must exist before the base initializer runs: that initializer calls
    # reset(), which may call addNamespace(), which reads _ns_rewrite.
    self._ns_rewrite: Dict[str, str] = {}
    super(TurtleSerializer, self).__init__(store)
    # Predicates written as a keyword instead of a name (rdf:type -> "a").
    self.keywords: Dict[Node, str] = {RDF.type: "a"}
    self.reset()
    self.stream = None
    self._spacious = _SPACIOUS_OUTPUT
# type error: Return type "str" of "addNamespace" incompatible with return type "None" in supertype "RecursiveSerializer"
def addNamespace(self, prefix: str, namespace: URIRef) -> str:  # type: ignore[override]
    """Register ``namespace`` and return the prefix actually used for it.

    The returned prefix differs from ``prefix`` when it had to be rewritten
    (leading underscore, or already bound to another namespace).
    """
    # Turtle does not support prefix that start with _
    # if they occur in the graph, rewrite to p_blah
    # this is more complicated since we need to make sure p_blah
    # does not already exist. And we register namespaces as we go, i.e.
    # we may first see a triple with prefix _9 - rewrite it to p_9
    # and then later find a triple with a "real" p_9 prefix
    # so we need to keep track of ns rewrites we made so far.

    if (prefix > "" and prefix[0] == "_") or self.namespaces.get(
        prefix, namespace
    ) != namespace:
        if prefix not in self._ns_rewrite:
            # Keep prepending "p" until the candidate is unused.
            p = "p" + prefix
            while p in self.namespaces:
                p = "p" + p
            self._ns_rewrite[prefix] = p

    # Apply a rewrite recorded now or by an earlier call.
    prefix = self._ns_rewrite.get(prefix, prefix)

    super(TurtleSerializer, self).addNamespace(prefix, namespace)
    return prefix
def reset(self) -> None:
    """Reset the inherited state plus the Turtle-specific bookkeeping."""
    super(TurtleSerializer, self).reset()
    # typing as Dict[None, None] because nothing seems to be using it
    self._shortNames: Dict[None, None] = {}
    # Set to True by startDocument.
    self._started = False
    # Prefix rewrites made by addNamespace (original prefix -> replacement).
    self._ns_rewrite = {}
def serialize(
    self,
    stream: IO[bytes],
    base: Optional[str] = None,
    encoding: Optional[str] = None,
    spacious: Optional[bool] = None,
    **kwargs: Any,
) -> None:
    """Write the whole graph to ``stream`` as Turtle.

    :param stream: binary stream receiving the encoded output.
    :param base: base IRI to declare; when ``None`` the store's base (if
        any) is used instead.
    :param encoding: accepted for interface compatibility; not read here.
    :param spacious: when not ``None``, overrides the ``_spacious`` flag.
    :param kwargs: ignored.

    ``self.base`` is cleared when the call ends, including when
    serialization fails part-way.
    """
    self.reset()
    self.stream = stream
    # if base is given here, use that, if not and a base is set for the graph use that
    if base is not None:
        self.base = base
    elif self.store.base is not None:
        self.base = self.store.base

    if spacious is not None:
        self._spacious = spacious

    try:
        self.preprocess()
        subjects_list = self.orderSubjects()

        self.startDocument()

        for subject in subjects_list:
            if self.isDone(subject):
                continue
            # Every written statement is followed by a newline. (The former
            # ``firstTime`` flag was always False when tested, so it never
            # suppressed this.)
            if self.statement(subject):
                self.write("\n")

        self.endDocument()
        stream.write("\n".encode("latin-1"))
    finally:
        # Do not leak this call's base into a later serialization.
        self.base = None
def preprocessTriple(self, triple: _TripleType) -> None:
    """Record the triple and register the namespaces its nodes need."""
    super(TurtleSerializer, self).preprocessTriple(triple)
    for i, node in enumerate(triple):
        if i == VERB and node in self.keywords:
            # predicate is a keyword
            continue
        # Don't use generated prefixes for subjects and objects
        self.getQName(node, gen_prefix=(i == VERB))
        if isinstance(node, Literal) and node.datatype:
            self.getQName(node.datatype, gen_prefix=_GEN_QNAME_FOR_DT)
    p = triple[1]
    if isinstance(p, BNode):  # hmm - when is P ever a bnode?
        self._references[p] += 1
# TODO: Rename to get_pname
def getQName(self, uri: Node, gen_prefix: bool = True) -> Optional[str]:
    """Return ``prefix:local`` for ``uri`` and register the namespace used.

    Returns ``None`` when ``uri`` is not a URIRef, when no qname can be
    computed for it, or when the local part would end with a dot.
    """
    if not isinstance(uri, URIRef):
        return None

    try:
        qname_parts = self.store.compute_qname(uri, generate=gen_prefix)
    except Exception:
        # is the uri a namespace in itself?
        ns_prefix = self.store.store.prefix(uri)
        if ns_prefix is None:
            # nothing worked
            return None
        qname_parts = (ns_prefix, uri, "")

    prefix, namespace, local = qname_parts
    escaped_local = local.replace(r"(", r"\(").replace(r")", r"\)")

    # QName cannot end with .
    if escaped_local.endswith("."):
        return None

    registered_prefix = self.addNamespace(prefix, namespace)
    return "%s:%s" % (registered_prefix, escaped_local)
def startDocument(self) -> None:
    """Write the ``@base`` line (if a base is set) and the ``@prefix`` lines."""
    self._started = True
    bindings = sorted(self.namespaces.items())

    if self.base:
        base_line = "@base <%s> .\n" % self.base
        self.write(self.indent() + base_line)

    for prefix, uri in bindings:
        prefix_line = "@prefix %s: <%s> .\n" % (prefix, uri)
        self.write(self.indent() + prefix_line)

    # In spacious mode a blank line separates the prefixes from the body.
    if bindings and self._spacious:
        self.write("\n")
def endDocument(self) -> None:
    """Finish the document; writes a newline only in spacious mode."""
    if self._spacious:
        self.write("\n")
def statement(self, subject: _SubjectType) -> bool:
    """Mark ``subject`` as done and write its statement.

    Tries the ``[]`` form first (``s_squared``) and falls back to the
    regular form (``s_default``).
    """
    self.subjectDone(subject)
    return self.s_squared(subject) or self.s_default(subject)
def s_default(self, subject: _SubjectType) -> bool:
    """Write ``subject`` followed by its predicate list and a closing `` .``.

    Always returns True.
    """
    self.write("\n" + self.indent())
    self.path(subject, SUBJECT)
    self.predicateList(subject)
    self.write(" .")
    return True
def s_squared(self, subject: _SubjectType) -> bool:
    """Write ``subject`` as an anonymous ``[]`` statement when possible.

    Only applies to blank nodes that are never referenced as an object;
    returns False (writing nothing) otherwise.
    """
    if (self._references[subject] > 0) or not isinstance(subject, BNode):
        return False
    self.write("\n" + self.indent() + "[]")
    self.predicateList(subject)
    self.write(" .")
    return True
def path(self, node: Node, position: int, newline: bool = False) -> None:
    """Write ``node`` at ``position``, preferring the inline (squared) form.

    :raises Error: when neither writer reports success.
    """
    if self.p_squared(node, position, newline):
        return
    if self.p_default(node, position, newline):
        return
    raise Error("Cannot serialize node '%s'" % (node,))
def p_default(self, node: Node, position: int, newline: bool = False) -> bool:
    """Write the label of ``node``; always returns True.

    A separating space is written first unless the node is in subject
    position or starts a new line.
    """
    if position != SUBJECT and not newline:
        self.write(" ")
    self.write(self.label(node, position))
    return True
def label(self, node: Node, position: int) -> str:
    """Return the Turtle text for ``node`` at the given triple position.

    ``rdf:nil`` becomes ``()``, a keyword predicate becomes its keyword,
    a literal is rendered through ``_literal_n3`` and anything else
    becomes a qname when one is available, its N3 form otherwise.
    """
    if node == RDF.nil:
        return "()"
    # Compare by value: ``is`` on an int only works by interning accident.
    if position == VERB and node in self.keywords:
        return self.keywords[node]
    if isinstance(node, Literal):
        return node._literal_n3(
            use_plain=True,
            qname_callback=lambda dt: self.getQName(dt, _GEN_QNAME_FOR_DT),
        )
    else:
        node = self.relativize(node)  # type: ignore[type-var]

        # Prefixes are only generated for predicates.
        return self.getQName(node, position == VERB) or node.n3()
def p_squared(self, node: Node, position: int, newline: bool = False) -> bool:
    """Try to write ``node`` inline, as ``( ... )`` or ``[ ... ]``.

    Returns False without writing anything when ``node`` is not a blank
    node, is already serialized, is referenced more than once, or is in
    subject position.
    """
    if (
        not isinstance(node, BNode)
        or node in self._serialized
        or self._references[node] > 1
        or position == SUBJECT
    ):
        return False

    if not newline:
        self.write(" ")

    if self.isValidList(node):
        # this is a list
        self.write("(")
        self.depth += 1  # 2
        self.doList(node)
        self.depth -= 1  # 2
        self.write(" )")
    else:
        # Mark as done before descending so the node is not written again.
        self.subjectDone(node)
        # Net effect: the nested predicate list runs one level deeper, and
        # the depth is back to its original value after the closing bracket.
        self.depth += 2
        # self.write('[\n' + self.indent())
        self.write("[")
        self.depth -= 1
        # self.predicateList(node, newline=True)
        self.predicateList(node, newline=False)
        # self.write('\n' + self.indent() + ']')
        self.write(" ]")
        self.depth -= 1

    return True
def isValidList(self, l_: Node) -> bool:
    """
    Checks if l is a valid RDF list, i.e. no nodes have other properties.

    Returns False when the head has no ``rdf:first``, when looking it up
    fails, when any node other than ``rdf:nil`` does not have exactly two
    predicate/object pairs, or when the ``rdf:rest`` chain loops back on
    itself (which would otherwise never terminate).
    """
    try:
        if self.store.value(l_, RDF.first) is None:
            return False
    except Exception:
        return False

    visited = set()
    while l_:
        if l_ in visited:
            # Cyclic rdf:rest chain: not a well-formed collection.
            return False
        visited.add(l_)
        if l_ != RDF.nil and len(list(self.store.predicate_objects(l_))) != 2:
            return False
        # type error: Incompatible types in assignment (expression has type "Optional[Node]", variable has type "Node")
        l_ = self.store.value(l_, RDF.rest)  # type: ignore[assignment]
    return True
def doList(self, l_: Node) -> None:
    """Write the items of the RDF list starting at ``l_``.

    Follows ``rdf:rest`` until no further node is found, writing each
    ``rdf:first`` value as an object and marking each list node as done.
    """
    while l_:
        item = self.store.value(l_, RDF.first)
        if item is not None:
            self.path(item, OBJECT)
            self.subjectDone(l_)
        # type error: Incompatible types in assignment (expression has type "Optional[Node]", variable has type "Node")
        l_ = self.store.value(l_, RDF.rest)  # type: ignore[assignment]
def predicateList(self, subject: Node, newline: bool = False) -> None:
    """Write every predicate of ``subject`` with its objects.

    Predicates are separated by `` ;`` and a line break; only the first
    one honours ``newline``, the following ones always start a new line.
    """
    by_predicate = self.buildPredicateHash(subject)
    ordered = self.sortProperties(by_predicate)

    for position, predicate in enumerate(ordered):
        if position == 0:
            self.verb(predicate, newline=newline)
        else:
            self.write(" ;\n" + self.indent(1))
            self.verb(predicate, newline=True)
        self.objectList(by_predicate[predicate])
def verb(self, node: Node, newline: bool = False) -> None:
    """Write ``node`` in predicate position."""
    self.path(node, VERB, newline)
def objectList(self, objects: Sequence[Node]) -> None:
    """Write the objects of one predicate, separated by commas and newlines."""
    count = len(objects)
    if count == 0:
        return
    # NOTE(review): because 0 is falsy, ``(count == 1) and 0 or 1`` evaluates
    # to 1 for every count, so the depth is always raised by one here.
    # Confirm whether 0 was intended for a single object before changing it:
    # doing so would alter the indentation of the serialized output.
    depthmod = (count == 1) and 0 or 1
    self.depth += depthmod
    self.path(objects[0], OBJECT)
    for obj in objects[1:]:
        self.write(",\n" + self.indent(1))
        self.path(obj, OBJECT, newline=True)
    self.depth -= depthmod
@@ -0,0 +1,128 @@
from __future__ import annotations
import codecs
from typing import IO, TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
from xml.sax.saxutils import escape, quoteattr
from rdflib.term import URIRef
if TYPE_CHECKING:
from rdflib.namespace import Namespace, NamespaceManager
__all__ = ["XMLWriter"]
# Extra replacements handed to ``escape`` by XMLWriter.text: carriage returns
# are written as a character reference.
ESCAPE_ENTITIES = {"\r": "&#13;"}
class XMLWriter:
    """Minimal streaming XML writer.

    Elements are opened with ``push`` and closed with ``pop``; attributes
    and text are written in between. Element and attribute names are given
    as URIs and converted with ``qname``.
    """
def __init__(
    self,
    stream: IO[bytes],
    namespace_manager: NamespaceManager,
    encoding: Optional[str] = None,
    decl: int = 1,
    extra_ns: Optional[Dict[str, Namespace]] = None,
):
    """Wrap ``stream`` in an encoding writer and optionally emit the XML declaration.

    :param stream: binary stream receiving the encoded document.
    :param namespace_manager: used by ``qname`` and ``namespaces``.
    :param encoding: output encoding; defaults to ``"utf-8"``.
    :param decl: when truthy, the ``<?xml ...?>`` declaration is written.
    :param extra_ns: additional prefix -> namespace bindings that take
        precedence in ``qname``.
    """
    encoding = encoding or "utf-8"
    writer_factory = codecs.getwriter(encoding)
    # NOTE on type ignores: this is mainly because the variable is being re-used.
    # type error: Incompatible types in assignment (expression has type "StreamWriter", variable has type "IO[bytes]")
    self.stream = stream = writer_factory(stream)  # type: ignore[assignment]
    if decl:
        # type error: No overload variant of "write" of "IO" matches argument type "str"
        stream.write('<?xml version="1.0" encoding="%s"?>' % encoding)  # type: ignore[call-overload]
    # URIs of the currently open elements, innermost last.
    self.element_stack: List[str] = []
    self.nm = namespace_manager
    self.extra_ns = extra_ns or {}
    # False while a start tag is still open (its ">" not yet written).
    self.closed = True
def __get_indent(self) -> str:
    """Return one indentation unit per currently open element."""
    return " " * len(self.element_stack)

# Read-only ``indent`` attribute backed by __get_indent.
indent = property(__get_indent)
def __close_start_tag(self) -> None:
    """Write the ``>`` of a start tag that is still open, if any."""
    if not self.closed:  # TODO:
        self.closed = True
        self.stream.write(">")
def push(self, uri: str) -> None:
    """Open a new element named by ``uri`` on its own indented line.

    The start tag is left open so attributes can still be added.
    """
    self.__close_start_tag()
    write = self.stream.write
    write("\n")
    write(self.indent)
    write("<%s" % self.qname(uri))
    self.element_stack.append(uri)
    self.closed = False
    # No child element has been closed inside this element yet (see pop).
    self.parent = False
def pop(self, uri: Optional[str] = None) -> None:
    """Close the innermost open element.

    :param uri: when given, asserted to equal the element being closed.

    An element whose start tag is still open is written as self-closing
    (``/>``); otherwise an end tag is written, on its own line if a child
    element was closed just before.
    """
    closing = self.element_stack.pop()
    if uri:
        assert uri == closing
    out = self.stream.write
    if self.closed:
        if self.parent:
            out("\n")
            out(self.indent)
        out("</%s>" % self.qname(closing))
    else:
        self.closed = True
        out("/>")
    self.parent = True
def element(
    self, uri: str, content: str, attributes: Optional[Dict[URIRef, str]] = None
) -> None:
    """Utility method for adding a complete simple element

    :param uri: URI naming the element.
    :param content: text content of the element.
    :param attributes: optional mapping of attribute URI to value; ``None``
        (the default) means no attributes.
    """
    self.push(uri)
    # ``None`` replaces the former shared mutable ``{}`` default.
    for k, v in (attributes or {}).items():
        self.attribute(k, v)
    self.text(content)
    self.pop()
def namespaces(
    self, namespaces: Optional[Iterable[Tuple[str, str]]] = None
) -> None:
    """Write ``xmlns`` declarations into the currently open start tag.

    :param namespaces: ``(prefix, namespace)`` pairs to declare; when
        omitted or empty, the namespace manager's bindings are used.

    Bindings from ``extra_ns`` are always written afterwards; a default
    (empty-prefix) binding in ``extra_ns`` suppresses the default binding
    from ``namespaces``. Namespace values are escaped and quoted as XML
    attribute values.
    """
    if not namespaces:
        namespaces = self.nm.namespaces()

    write = self.stream.write
    write("\n")
    for prefix, namespace in namespaces:
        if prefix:
            write(" xmlns:%s=%s\n" % (prefix, quoteattr(str(namespace))))
        # Allow user-provided namespace bindings to prevail
        elif prefix not in self.extra_ns:
            write(" xmlns=%s\n" % quoteattr(str(namespace)))

    for prefix, namespace in self.extra_ns.items():
        if prefix:
            write(" xmlns:%s=%s\n" % (prefix, quoteattr(str(namespace))))
        else:
            write(" xmlns=%s\n" % quoteattr(str(namespace)))
def attribute(self, uri: str, value: str) -> None:
    """Write one attribute; the value is escaped and quoted by ``quoteattr``."""
    write = self.stream.write
    write(" %s=%s" % (self.qname(uri), quoteattr(value)))
def text(self, text: str) -> None:
    """Write character data inside the current element.

    Text containing both ``<`` and ``>`` is wrapped in a CDATA section,
    unless it contains ``]]>`` (which would end the section early);
    everything else is escaped.
    """
    self.__close_start_tag()
    if "<" in text and ">" in text and "]]>" not in text:
        self.stream.write("<![CDATA[")
        self.stream.write(text)
        self.stream.write("]]>")
    else:
        self.stream.write(escape(text, ESCAPE_ENTITIES))
def qname(self, uri: str) -> str:
    """Compute qname for a uri using our extra namespaces,
    or the given namespace manager"""
    for prefix, namespace in self.extra_ns.items():
        if not uri.startswith(namespace):
            continue
        local_name = uri[len(namespace) :]
        # An empty prefix is the default namespace: no "prefix:" part.
        if prefix == "":
            return local_name
        return ":".join([prefix, local_name])

    return self.nm.qname_strict(uri)
@@ -0,0 +1,676 @@
"""
Implementation of the JSON-LD Context structure. See:
http://json-ld.org/
"""
# https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/context.py
from __future__ import annotations
from collections import namedtuple
from typing import (
TYPE_CHECKING,
Any,
Collection,
Dict,
Generator,
List,
Optional,
Set,
Tuple,
Union,
)
from urllib.parse import urljoin, urlsplit
from rdflib.namespace import RDF
from .errors import (
INVALID_CONTEXT_ENTRY,
INVALID_REMOTE_CONTEXT,
RECURSIVE_CONTEXT_INCLUSION,
)
from .keys import (
BASE,
CONTAINER,
CONTEXT,
GRAPH,
ID,
IMPORT,
INCLUDED,
INDEX,
JSON,
LANG,
LIST,
NEST,
NONE,
PREFIX,
PROPAGATE,
PROTECTED,
REV,
SET,
TYPE,
VALUE,
VERSION,
VOCAB,
)
from .util import norm_url, source_to_json, split_iri
# Keywords that Context._accept_term lets through, and for which
# Context._read_term records aliases.
NODE_KEYS = {GRAPH, ID, INCLUDED, JSON, LIST, NEST, NONE, REV, SET, TYPE, VALUE, LANG}
class Defined(int):
    """Marker integer type; its instance ``UNDEF`` stands for "not specified"."""

    pass
# Falsy sentinel meaning "no value given"; code tests for it by identity,
# which keeps it distinguishable from None.
UNDEF = Defined(0)
# From <https://tools.ietf.org/html/rfc3986#section-2.2>
# An IRI ending in one of these is treated as usable as a prefix (see add_term).
URI_GEN_DELIMS = (":", "/", "?", "#", "[", "]", "@")
# Shapes accepted as a context source: one entry, or a list of entries.
_ContextSourceType = Union[
    List[Union[Dict[str, Any], str, None]], Dict[str, Any], str, None
]
class Context:
    """In-memory JSON-LD context.

    Holds the term definitions, prefixes, keyword aliases, vocabulary,
    base and default language read from context sources, and offers
    expansion (``expand``, ``resolve``) and compaction (``shrink_iri``,
    ``to_symbol``) helpers built on them.
    """
def __init__(
    self,
    source: _ContextSourceType = None,
    base: Optional[str] = None,
    version: Optional[float] = 1.1,
):
    """Create a context and, when ``source`` is truthy, load it.

    :param source: context source(s) handed to ``load``.
    :param base: base IRI; also remembered unchanged as ``doc_base``.
    :param version: JSON-LD version; a falsy value means 1.1.
    """
    self.version: float = version or 1.1
    self.language = None
    self.vocab: Optional[str] = None
    # Declared only; the value is assigned by the ``base`` setter below.
    self._base: Optional[str]
    self.base = base
    self.doc_base = base
    # Term name -> Term.
    self.terms: Dict[str, Any] = {}
    # _alias maps NODE_KEY to list of aliases
    self._alias: Dict[str, List[str]] = {}
    # (id, coercion or language, container key, reverse) -> Term; see add_term.
    self._lookup: Dict[Tuple[str, Any, Union[Defined, str], bool], Term] = {}
    # Namespace IRI -> prefix name, for terms usable as prefixes.
    self._prefixes: Dict[str, Any] = {}
    self.active = False
    self.parent: Optional[Context] = None
    self.propagate = True
    # Fetched context documents keyed by URL; shared with subcontexts.
    self._context_cache: Dict[str, Any] = {}
    if source:
        self.load(source)
@property
def base(self) -> Optional[str]:
    """The base IRI as stored by the setter."""
    return self._base

@base.setter
def base(self, base: Optional[str]):
    """Set the base IRI, dropping any fragment."""
    if base:
        # Only the part before "#" is kept.
        hash_index = base.find("#")
        if hash_index > -1:
            base = base[0:hash_index]
    # On the first assignment (from __init__, before ``_base`` exists) the
    # value is stored as given; later values are resolved against the
    # previous base.
    self._base = (
        self.resolve_iri(base)
        if (hasattr(self, "_base") and base is not None)
        else base
    )
    # "scheme://netloc" of the new base; read by shrink_iri.
    self._basedomain = "%s://%s" % urlsplit(base)[0:2] if base else None
def subcontext(self, source: Any, propagate: bool = True) -> Context:
    """Create a child context from ``source``.

    The child derives from this context, or from this context's parent
    when this context was marked as non-propagating.
    """
    # IMPROVE: to optimize, implement SubContext with parent fallback support
    origin = self if self.propagate is not False else self.parent
    # type error: Item "None" of "Optional[Context]" has no attribute "_subcontext"
    return origin._subcontext(source, propagate)  # type: ignore[union-attr]
def _subcontext(self, source: Any, propagate: bool) -> Context:
    """Build a child context that starts as a copy of this one, then loads ``source``."""
    ctx = Context(version=self.version)
    ctx.propagate = propagate
    ctx.parent = self
    ctx.language = self.language
    ctx.vocab = self.vocab
    ctx.base = self.base
    ctx.doc_base = self.doc_base
    # Alias lists are copied so the child can change them independently.
    ctx._alias = {k: l[:] for k, l in self._alias.items()}  # noqa: E741
    ctx.terms = self.terms.copy()
    ctx._lookup = self._lookup.copy()
    ctx._prefixes = self._prefixes.copy()
    # The cache of fetched documents is shared, not copied.
    ctx._context_cache = self._context_cache
    ctx.load(source)
    return ctx
def _clear(self) -> None:
    """Forget all definitions; ``load`` calls this for a ``None`` source.

    The base, the parent link, the version and the cache of fetched
    documents are left untouched.
    """
    self.language = None
    self.vocab = None
    self.terms = {}
    self._alias = {}
    self._lookup = {}
    self._prefixes = {}
    self.active = False
    self.propagate = True
def get_context_for_term(self, term: Optional[Term]) -> Context:
    """Return the context that applies below ``term``.

    A term carrying its own context yields a new subcontext built from
    it; otherwise this context is returned unchanged.
    """
    if term and term.context is not UNDEF:
        return self._subcontext(term.context, propagate=True)
    return self
def get_context_for_type(self, node: Any) -> Optional[Context]:
    """Return the context that applies to ``node`` given its type(s).

    For version 1.1 and later, the first type of ``node`` that names a
    defined term is looked up; when that term carries a context, a
    non-propagating subcontext built from it is returned. Otherwise this
    context is returned, or its parent when this context does not
    propagate.
    """
    if self.version >= 1.1:
        rtype = self.get_type(node) if isinstance(node, dict) else None
        # Normalise to a list of type values.
        if not isinstance(rtype, list):
            rtype = [rtype] if rtype else []

        typeterm = None
        for rt in rtype:
            try:
                typeterm = self.terms.get(rt)
            except TypeError:
                # extra lenience, triggers if type is set to a literal
                pass
            if typeterm is not None:
                break

        if typeterm and typeterm.context:
            subcontext = self.subcontext(typeterm.context, propagate=False)
            if subcontext:
                return subcontext

    return self.parent if self.propagate is False else self
# Keyword accessors: each returns the value stored in ``obj`` under the
# given keyword or under one of its aliases (see ``_get``).

def get_id(self, obj: Dict[str, Any]) -> Any:
    """Return the ``@id`` value of ``obj``."""
    return self._get(obj, ID)

def get_type(self, obj: Dict[str, Any]) -> Any:
    """Return the ``@type`` value of ``obj``."""
    return self._get(obj, TYPE)

def get_language(self, obj: Dict[str, Any]) -> Any:
    """Return the ``@language`` value of ``obj``."""
    return self._get(obj, LANG)

def get_value(self, obj: Dict[str, Any]) -> Any:
    """Return the ``@value`` value of ``obj``."""
    return self._get(obj, VALUE)

def get_graph(self, obj: Dict[str, Any]) -> Any:
    """Return the ``@graph`` value of ``obj``."""
    return self._get(obj, GRAPH)

def get_list(self, obj: Dict[str, Any]) -> Any:
    """Return the ``@list`` value of ``obj``."""
    return self._get(obj, LIST)

def get_set(self, obj: Dict[str, Any]) -> Any:
    """Return the ``@set`` value of ``obj``."""
    return self._get(obj, SET)

def get_rev(self, obj: Dict[str, Any]) -> Any:
    """Return the ``@reverse`` value of ``obj``."""
    return self._get(obj, REV)
def _get(self, obj: Dict[str, Any], key: str) -> Any:
    """Look up ``key`` in ``obj``, preferring any alias defined for it.

    The first alias of ``key`` that is present in ``obj`` wins; without
    such an alias the keyword itself is used. Returns ``None`` when
    nothing is found.
    """
    aliases = self._alias.get(key, [])
    effective_key = next((alias for alias in aliases if alias in obj), key)
    return obj.get(effective_key)
# type error: Missing return statement
def get_key(self, key: str) -> str:  # type: ignore[return]
    """Return the preferred name for ``key``.

    This is the first item produced by ``get_keys``: the first alias when
    one is defined, otherwise ``key`` itself.
    """
    for alias in self.get_keys(key):
        return alias
def get_keys(self, key: str) -> Generator[str, None, None]:
    """Yield every alias defined for ``key``, then ``key`` itself."""
    yield from self._alias.get(key, ())
    yield key
# Read-only shortcuts: the preferred name (first alias, else the keyword
# itself) for each of these keywords, as computed by ``get_key``.
lang_key = property(lambda self: self.get_key(LANG))
id_key = property(lambda self: self.get_key(ID))
type_key = property(lambda self: self.get_key(TYPE))
value_key = property(lambda self: self.get_key(VALUE))
list_key = property(lambda self: self.get_key(LIST))
rev_key = property(lambda self: self.get_key(REV))
graph_key = property(lambda self: self.get_key(GRAPH))
def add_term(
    self,
    name: str,
    idref: str,
    coercion: Union[Defined, str] = UNDEF,
    container: Union[Collection[Any], str, Defined] = UNDEF,
    index: Optional[Union[str, Defined]] = None,
    language: Optional[Union[str, Defined]] = UNDEF,
    reverse: bool = False,
    context: Any = UNDEF,
    prefix: Optional[bool] = None,
    protected: bool = False,
):
    """Define the term ``name`` and index it for reverse lookup.

    Nothing is added when ``name`` is rejected by ``_accept_term`` or, for
    version 1.1 and later, when an existing definition of ``name`` is
    protected.
    """
    # Without an explicit flag (or before 1.1), a term can serve as a
    # prefix when its IRI ends in a URI delimiter.
    if self.version < 1.1 or prefix is None:
        prefix = isinstance(idref, str) and idref.endswith(URI_GEN_DELIMS)

    if not self._accept_term(name):
        return

    if self.version >= 1.1:
        existing = self.terms.get(name)
        if existing and existing.protected:
            return

    # Normalise ``container`` to a set of container keywords.
    if isinstance(container, (list, set, tuple)):
        container = set(container)
    elif container is not UNDEF:
        container = set([container])
    else:
        container = set()

    term = Term(
        idref,
        name,
        coercion,
        container,
        index,
        language,
        reverse,
        context,
        prefix,
        protected,
    )

    self.terms[name] = term

    # The lookup key uses the first of these container kinds the term has,
    # or UNDEF when it has none of them.
    container_key: Union[Defined, str]
    for container_key in (LIST, LANG, SET):  # , INDEX, ID, GRAPH):
        if container_key in container:
            break
    else:
        container_key = UNDEF

    self._lookup[(idref, coercion or language, container_key, reverse)] = term

    if term.prefix is True:
        self._prefixes[idref] = name
def find_term(
    self,
    idref: str,
    coercion: Optional[Union[str, Defined]] = None,
    container: Union[Defined, str] = UNDEF,
    language: Optional[str] = None,
    reverse: bool = False,
):
    """Find the term defined for ``idref`` that best matches the given traits.

    Lookups are tried from most to least specific; the first hit is
    returned. The final fallback is the definition registered without any
    coercion or container, and ``None`` is returned when even that is
    missing.
    """
    lu = self._lookup

    # Without an explicit coercion the language takes its place in the key.
    if coercion is None:
        coercion = language

    if coercion is not UNDEF and container:
        found = lu.get((idref, coercion, container, reverse))
        if found:
            return found

    if coercion is not UNDEF:
        found = lu.get((idref, coercion, UNDEF, reverse))
        if found:
            return found

    if container:
        found = lu.get((idref, coercion, container, reverse))
        if found:
            return found
    elif language:
        # A language-tagged value may match a language container.
        found = lu.get((idref, UNDEF, LANG, reverse))
        if found:
            return found
    else:
        found = lu.get((idref, coercion or UNDEF, SET, reverse))
        if found:
            return found

    return lu.get((idref, UNDEF, UNDEF, reverse))
def resolve(self, curie_or_iri: str) -> str:
    """Expand ``curie_or_iri`` (without vocabulary) and resolve it against the base.

    Blank node identifiers are returned as expanded; a result containing
    a space yields the empty string.
    """
    iri = self.expand(curie_or_iri, False)
    # type error: Argument 1 to "isblank" of "Context" has incompatible type "Optional[str]"; expected "str"
    if self.isblank(iri):  # type: ignore[arg-type]
        # type error: Incompatible return value type (got "Optional[str]", expected "str")
        return iri  # type: ignore[return-value]
    # type error: Unsupported right operand type for in ("Optional[str]")
    if " " in iri:  # type: ignore[operator]
        return ""
    # type error: Argument 1 to "resolve_iri" of "Context" has incompatible type "Optional[str]"; expected "str"
    return self.resolve_iri(iri)  # type: ignore[arg-type]
def resolve_iri(self, iri: str) -> str:
    """Resolve ``iri`` against the current base using ``norm_url``."""
    # type error: Argument 1 to "norm_url" has incompatible type "Optional[str]"; expected "str"
    return norm_url(self._base, iri)  # type: ignore[arg-type]
def isblank(self, ref: str) -> bool:
    """Tell whether ``ref`` is a blank node identifier (starts with ``_:``)."""
    return ref.startswith("_:")
def expand(self, term_curie_or_iri: Any, use_vocab: bool = True) -> Optional[str]:
    """Expand a term, compact IRI or IRI to a full IRI.

    :param term_curie_or_iri: value to expand; non-strings are returned
        unchanged.
    :param use_vocab: whether term definitions and the vocabulary may be
        used.
    :return: the expanded IRI; ``""`` when the value is rejected by
        ``_accept_term``; ``None`` for a plain term that has no definition
        while no vocabulary is set.
    """
    if not isinstance(term_curie_or_iri, str):
        return term_curie_or_iri

    if not self._accept_term(term_curie_or_iri):
        return ""

    if use_vocab:
        term = self.terms.get(term_curie_or_iri)
        if term:
            return term.id

    is_term, pfx, local = self._prep_expand(term_curie_or_iri)
    # Blank node identifiers are kept as they are.
    if pfx == "_":
        return term_curie_or_iri

    if pfx is not None:
        ns = self.terms.get(pfx)
        if ns and ns.prefix and ns.id:
            return ns.id + local
    elif is_term and use_vocab:
        if self.vocab:
            return self.vocab + term_curie_or_iri
        return None

    # Anything else is treated as an IRI reference relative to the base.
    return self.resolve_iri(term_curie_or_iri)
def shrink_iri(self, iri: str) -> str:
    """Shorten ``iri`` using a known prefix or the base.

    Returns ``prefix:name`` when the namespace part has a prefix; with a
    base set, returns ``""`` for the base itself and the remainder after
    the base's ``scheme://netloc`` for IRIs starting with it. Otherwise
    ``iri`` is returned unchanged.
    """
    ns, name = split_iri(str(iri))
    pfx = self._prefixes.get(ns)
    if pfx:
        # type error: Argument 1 to "join" of "str" has incompatible type "Tuple[Any, Optional[str]]"; expected "Iterable[str]"
        return ":".join((pfx, name))  # type: ignore[arg-type]
    elif self._base:
        if str(iri) == self._base:
            return ""
        # type error: Argument 1 to "startswith" of "str" has incompatible type "Optional[str]"; expected "Union[str, Tuple[str, ...]]"
        elif iri.startswith(self._basedomain):  # type: ignore[arg-type]
            # type error: Argument 1 to "len" has incompatible type "Optional[str]"; expected "Sized"
            return iri[len(self._basedomain) :]  # type: ignore[arg-type]
    return iri
def to_symbol(self, iri: str) -> Optional[str]:
    """Return the shortest known name for ``iri``.

    Tried in order: the name of a term defined for it, the local name when
    its namespace is the vocabulary, ``prefix:name`` when its namespace
    has a prefix, and finally the IRI itself.
    """
    iri = str(iri)
    term = self.find_term(iri)
    if term:
        return term.name
    ns, name = split_iri(iri)
    if ns == self.vocab:
        return name
    pfx = self._prefixes.get(ns)
    if pfx:
        # type error: Argument 1 to "join" of "str" has incompatible type "Tuple[Any, Optional[str]]"; expected "Iterable[str]"
        return ":".join((pfx, name))  # type: ignore[arg-type]
    return iri
def load(
    self,
    source: _ContextSourceType,
    base: Optional[str] = None,
    referenced_contexts: Optional[Set[Any]] = None,
):
    """Read one or more context sources into this context.

    :param source: a context (dict), a context URL (str), ``None`` (which
        clears the definitions read so far), or a list of these.
    :param base: base IRI handed to ``_prep_sources``.
    :param referenced_contexts: set collecting the URLs of the contexts
        fetched so far, used to detect recursive inclusion. A set passed
        by the caller is updated in place, even when it starts out empty;
        a fresh one is created when omitted.
    """
    self.active = True
    sources: List[Tuple[Optional[str], Union[Dict[str, Any], str, None]]] = []
    inputs = source if isinstance(source, list) else [source]
    # ``is None`` rather than ``or``: an empty caller-supplied set must be kept.
    if referenced_contexts is None:
        referenced_contexts = set()
    self._prep_sources(base, inputs, sources, referenced_contexts)
    for source_url, ctx_source in sources:
        if ctx_source is None:
            self._clear()
        else:
            # type error: Argument 1 to "_read_source" of "Context" has incompatible type "Union[Dict[str, Any], str]"; expected "Dict[str, Any]"
            self._read_source(ctx_source, source_url, referenced_contexts)  # type: ignore[arg-type]
def _accept_term(self, key: str) -> bool:
    """Tell whether ``key`` may be used as a term.

    Before version 1.1 everything is accepted. From 1.1 on, a key that
    looks like a keyword ("@" followed by an alphanumeric character) is
    accepted only when it is one of ``NODE_KEYS``.
    """
    if self.version < 1.1:
        return True
    keyword_like = (
        bool(key) and len(key) > 1 and key[0] == "@" and key[1].isalnum()
    )
    if not keyword_like:
        return True
    return key in NODE_KEYS
def _prep_sources(
    self,
    base: Optional[str],
    inputs: Union[List[Union[Dict[str, Any], str, None]], List[str]],
    sources: List[Tuple[Optional[str], Union[Dict[str, Any], str, None]]],
    referenced_contexts: Set[str],
    in_source_url: Optional[str] = None,
):
    """Flatten ``inputs`` into ``(source_url, context)`` pairs appended to ``sources``.

    String inputs are fetched as remote contexts, documents are unwrapped
    to the value of their context key, and lists are processed
    recursively. ``sources`` is filled in place; nothing is returned.
    """
    for source in inputs:
        source_url = in_source_url
        new_base = base
        if isinstance(source, str):
            # A string names a remote context document.
            source_url = source
            source_doc_base = base or self.doc_base
            new_ctx = self._fetch_context(
                source, source_doc_base, referenced_contexts
            )
            if new_ctx is None:
                continue
            else:
                if base:
                    if TYPE_CHECKING:
                        # if base is not None, then source_doc_base won't be
                        # none due to how it is assigned.
                        assert source_doc_base is not None
                    # Contexts nested in the fetched document resolve
                    # against that document's URL.
                    new_base = urljoin(source_doc_base, source_url)
                source = new_ctx

        if isinstance(source, dict):
            if CONTEXT in source:
                source = source[CONTEXT]
                # type ignore: Incompatible types in assignment (expression has type "List[Union[Dict[str, Any], str, None]]", variable has type "Union[Dict[str, Any], str, None]")
                source = source if isinstance(source, list) else [source]  # type: ignore[assignment]

        if isinstance(source, list):
            # type error: Statement is unreachable
            self._prep_sources(  # type: ignore[unreachable]
                new_base, source, sources, referenced_contexts, source_url
            )
        else:
            sources.append((source_url, source))
def _fetch_context(
    self, source: str, base: Optional[str], referenced_contexts: Set[str]
):
    """Load the context document at ``source`` (resolved against ``base``).

    The resolved URL is added to ``referenced_contexts``, and fetched
    documents are kept in ``_context_cache`` for reuse.

    :raises JSONLDException: ``RECURSIVE_CONTEXT_INCLUSION`` when the URL
        was already referenced; ``INVALID_REMOTE_CONTEXT`` when a
        non-empty document lacks the context key.
    """
    # type error: Value of type variable "AnyStr" of "urljoin" cannot be "Optional[str]"
    source_url = urljoin(base, source)  # type: ignore[type-var]

    if source_url in referenced_contexts:
        raise RECURSIVE_CONTEXT_INCLUSION

    # type error: Argument 1 to "add" of "set" has incompatible type "Optional[str]"; expected "str"
    referenced_contexts.add(source_url)  # type: ignore[arg-type]

    if source_url in self._context_cache:
        return self._context_cache[source_url]

    # type error: Incompatible types in assignment (expression has type "Optional[Any]", variable has type "str")
    source_json, _ = source_to_json(source_url)
    if source_json and CONTEXT not in source_json:
        raise INVALID_REMOTE_CONTEXT

    # type error: Invalid index type "Optional[str]" for "Dict[str, Any]"; expected type "str"
    self._context_cache[source_url] = source_json  # type: ignore[index]

    return source_json
def _read_source(
    self,
    source: Dict[str, Any],
    source_url: Optional[str] = None,
    referenced_contexts: Optional[Set[str]] = None,
):
    """Apply one context definition to this context.

    :param source: the context definition.
    :param source_url: URL the definition was loaded from, if any; a
        base declared in a remotely loaded definition is ignored.
    :param referenced_contexts: URLs of the contexts referenced so far,
        used to detect recursive imports.
    :raises JSONLDException: ``INVALID_CONTEXT_ENTRY`` when the import
        entry is not a string, or the imported document or its context is
        not a dict.
    """
    imports = source.get(IMPORT)
    if imports:
        if not isinstance(imports, str):
            raise INVALID_CONTEXT_ENTRY

        # ``is None`` rather than ``or``: an empty shared set must be kept
        # so that imports are tracked for recursion detection.
        if referenced_contexts is None:
            referenced_contexts = set()
        imported = self._fetch_context(imports, self.base, referenced_contexts)
        if not isinstance(imported, dict):
            raise INVALID_CONTEXT_ENTRY

        imported_ctx = imported.get(CONTEXT)
        if not isinstance(imported_ctx, dict):
            raise INVALID_CONTEXT_ENTRY

        # Merge into a new dict: the imported document lives in the shared
        # context cache and must not pick up this importer's overrides.
        source = {**imported_ctx, **source}

    self.vocab = source.get(VOCAB, self.vocab)
    self.version = source.get(VERSION, self.version)
    protected = source.get(PROTECTED, False)

    for key, value in source.items():
        if key in {VOCAB, VERSION, IMPORT, PROTECTED}:
            # Handled above (or not a term definition).
            continue
        elif key == PROPAGATE and isinstance(value, bool):
            self.propagate = value
        elif key == LANG:
            self.language = value
        elif key == BASE:
            if not source_url and not imports:
                self.base = value
        else:
            self._read_term(source, key, value, protected)
def _read_term(
    self,
    source: Dict[str, Any],
    name: str,
    dfn: Union[Dict[str, Any], str],
    protected: bool = False,
) -> None:
    """Read the definition ``dfn`` of term ``name`` and register it.

    :param source: the context definition ``dfn`` comes from; used to
        expand identifiers that refer to other entries.
    :param name: the term being defined.
    :param dfn: expanded (dict) or simple (string) term definition; any
        other value registers the term without an identifier.
    :param protected: default protection flag for the term.
    """
    idref = None
    if isinstance(dfn, dict):
        # term = self._create_term(source, key, value)
        rev = dfn.get(REV)
        protected = dfn.get(PROTECTED, protected)

        coercion = dfn.get(TYPE, UNDEF)
        if coercion and coercion not in (ID, TYPE, VOCAB):
            coercion = self._rec_expand(source, coercion)

        # A reverse definition supplies the identifier in place of the id entry.
        idref = rev or dfn.get(ID, UNDEF)
        if idref == TYPE:
            idref = str(RDF.type)
            coercion = VOCAB
        elif idref is not UNDEF:
            idref = self._rec_expand(source, idref)
        elif ":" in name:
            idref = self._rec_expand(source, name)
        elif self.vocab:
            idref = self.vocab + name

        context = dfn.get(CONTEXT, UNDEF)

        self.add_term(
            name,
            idref,
            coercion,
            dfn.get(CONTAINER, UNDEF),
            dfn.get(INDEX, UNDEF),
            dfn.get(LANG, UNDEF),
            bool(rev),
            context,
            dfn.get(PREFIX),
            protected=protected,
        )
    else:
        if isinstance(dfn, str):
            if not self._accept_term(dfn):
                return
            idref = self._rec_expand(source, dfn)
        # type error: Argument 2 to "add_term" of "Context" has incompatible type "Optional[str]"; expected "str"
        self.add_term(name, idref, protected=protected)  # type: ignore[arg-type]

    if idref in NODE_KEYS:
        # The term is an alias for a keyword.
        self._alias.setdefault(idref, []).append(name)
    else:
        # undo aliases that may have been inherited from parent context
        for v in self._alias.values():
            if name in v:
                v.remove(name)
def _rec_expand(
    self, source: Dict[str, Any], expr: Optional[str], prev: Optional[str] = None
) -> Optional[str]:
    """Expand ``expr`` repeatedly until it no longer changes.

    Prefixes and terms are looked up in ``source`` first and in the terms
    already defined second. Recursion stops when an expansion step
    returns its input (``expr == prev``) or reaches a keyword from
    ``NODE_KEYS``.
    """
    if expr == prev or expr in NODE_KEYS:
        return expr

    nxt: Optional[str]
    # type error: Argument 1 to "_prep_expand" of "Context" has incompatible type "Optional[str]"; expected "str"
    is_term, pfx, nxt = self._prep_expand(expr)  # type: ignore[arg-type]
    if pfx:
        iri = self._get_source_id(source, pfx)
        if iri is None:
            if pfx + ":" == self.vocab:
                return expr
            else:
                term = self.terms.get(pfx)
                if term:
                    iri = term.id

        if iri is None:
            # Unknown prefix: keep the expression as it is.
            nxt = expr
        else:
            nxt = iri + nxt
    else:
        nxt = self._get_source_id(source, nxt) or nxt
        if ":" not in nxt and self.vocab:
            return self.vocab + nxt

    return self._rec_expand(source, nxt, expr)
def _prep_expand(self, expr: str) -> Tuple[bool, Optional[str], str]:
    """Split ``expr`` into ``(is_term, prefix, local)``.

    * no colon: ``(True, None, expr)`` - a plain term;
    * ``prefix:local``: ``(False, prefix, local)``;
    * colon followed by ``//``: ``(False, None, expr)`` - an absolute IRI.
    """
    prefix, colon, remainder = expr.partition(":")
    if not colon:
        return True, None, expr
    if remainder.startswith("//"):
        return False, None, expr
    return False, prefix, remainder
def _get_source_id(self, source: Dict[str, Any], key: str) -> Optional[str]:
    """Return the identifier ``key`` stands for.

    The entry in ``source`` wins: a dict entry contributes its id value,
    any other non-``None`` entry is returned as is. Without an entry, the
    id of an already defined term named ``key`` is used, if there is one.
    """
    # .. from source dict or if already defined
    declared = source.get(key)
    if isinstance(declared, dict):
        return declared.get(ID)
    if declared is not None:
        return declared
    known = self.terms.get(key)
    return known.id if known else None
def _term_dict(self, term: Term) -> Union[Dict[str, Any], str]:
    """Return the JSON form of one term definition.

    A definition consisting of nothing but an id is returned as that id
    string; otherwise a dict is returned.
    """
    tdict: Dict[str, Any] = {}
    if term.type != UNDEF:
        tdict[TYPE] = self.shrink_iri(term.type)
    if term.container:
        tdict[CONTAINER] = list(term.container)
    if term.language != UNDEF:
        tdict[LANG] = term.language
    # A reverse term stores its identifier under the reverse key instead.
    if term.reverse:
        tdict[REV] = term.id
    else:
        tdict[ID] = term.id
    if tdict.keys() == {ID}:
        return tdict[ID]
    return tdict
def to_dict(self) -> Dict[str, Any]:
    """
    Returns a dictionary representation of the context that can be
    serialized to JSON.

    Prefixes are written first (name -> namespace), followed by the term
    definitions, which replace a prefix entry of the same name; the base
    and the language are added when set.

    :return: a dictionary representation of the context.
    """
    r = {v: k for (k, v) in self._prefixes.items()}
    r.update({term.name: self._term_dict(term) for term in self._lookup.values()})
    if self.base:
        r[BASE] = self.base
    if self.language:
        r[LANG] = self.language
    return r
# Immutable record describing one term definition of a Context.
Term = namedtuple(
    "Term",
    "id, name, type, container, index, language, reverse, context," "prefix, protected",
)
# Defaults for the last eight fields (type .. protected), in field order;
# ``id`` and ``name`` are required.
Term.__new__.__defaults__ = (UNDEF, UNDEF, UNDEF, UNDEF, False, UNDEF, False, False)
@@ -0,0 +1,9 @@
# https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/errors.py
class JSONLDException(ValueError):  # noqa: N818
    """Error raised for invalid JSON-LD input; a ``ValueError`` subclass."""

    pass
# http://www.w3.org/TR/json-ld-api/#idl-def-JsonLdErrorCode.{code-message}
# Pre-built exception instances, one per error code used by this package.
RECURSIVE_CONTEXT_INCLUSION = JSONLDException("recursive context inclusion")
INVALID_REMOTE_CONTEXT = JSONLDException("invalid remote context")
INVALID_CONTEXT_ENTRY = JSONLDException("invalid context entry")
@@ -0,0 +1,24 @@
# https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/keys.py
# String constants for the "@"-prefixed JSON-LD keywords.
BASE = "@base"
CONTAINER = "@container"
CONTEXT = "@context"
# DIRECTION = u'@direction'
GRAPH = "@graph"
ID = "@id"
IMPORT = "@import"
INCLUDED = "@included"
INDEX = "@index"
JSON = "@json"
# LANG and LANGUAGE are two names bound to the same keyword string.
LANG = LANGUAGE = "@language"
LIST = "@list"
NEST = "@nest"
NONE = "@none"
PREFIX = "@prefix"
PROPAGATE = "@propagate"
PROTECTED = "@protected"
# REV and REVERSE are two names bound to the same keyword string.
REV = REVERSE = "@reverse"
SET = "@set"
TYPE = "@type"
VALUE = "@value"
VERSION = "@version"
VOCAB = "@vocab"
@@ -0,0 +1,355 @@
# https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/util.py
from __future__ import annotations
import json
import pathlib
from html.parser import HTMLParser
from io import StringIO, TextIOBase, TextIOWrapper
from typing import IO, TYPE_CHECKING, Any, Dict, List, Optional, TextIO, Tuple, Union
if TYPE_CHECKING:
import json
else:
try:
import json
assert json # workaround for pyflakes issue #13
except ImportError:
import simplejson as json
from posixpath import normpath, sep
from typing import TYPE_CHECKING, cast
from urllib.parse import urljoin, urlsplit, urlunsplit
try:
import orjson
_HAS_ORJSON = True
except ImportError:
orjson = None # type: ignore[assignment, unused-ignore]
_HAS_ORJSON = False
from rdflib.parser import (
BytesIOWrapper,
InputSource,
PythonInputSource,
StringInputSource,
URLInputSource,
create_input_source,
)
def source_to_json(
    source: Optional[
        Union[IO[bytes], TextIO, InputSource, str, bytes, pathlib.PurePath]
    ],
    fragment_id: Optional[str] = None,
    extract_all_scripts: Optional[bool] = False,
) -> Tuple[Union[Dict, List[Dict]], Any]:
    """Extract JSON from a source document.

    The source document can be JSON or HTML with embedded JSON script elements (type attribute = "application/ld+json").
    To process as HTML ``source.content_type`` must be set to "text/html" or "application/xhtml+xml".

    Every code path returns a two-element tuple; ``orjson`` is used for
    decoding when it is installed, the standard ``json`` module otherwise.

    :param source: the input source document (JSON or HTML)
    :param fragment_id: if source is an HTML document then extract only the script element with matching id attribute, defaults to None
    :param extract_all_scripts: if source is an HTML document then extract all script elements (unless fragment_id is provided), defaults to False (extract only the first script element)
    :return: Tuple with the extracted JSON document and value of the HTML base element
    :raises ValueError: if the resolved input source offers neither a byte
        stream nor a character stream
    """
    if isinstance(source, PythonInputSource):
        return source.data, None

    if isinstance(source, StringInputSource):
        # A StringInputSource is assumed to be never a HTMLJSON doc
        html_base: Any = None
        # We can get the original string from the StringInputSource
        # It's hidden in the BytesIOWrapper 'wrapped' attribute
        b_stream = source.getByteStream()
        original_string: Optional[str] = None
        json_dict: Union[Dict, List[Dict]]
        if isinstance(b_stream, BytesIOWrapper):
            wrapped_inner = cast(Union[str, StringIO, TextIOBase], b_stream.wrapped)
            if isinstance(wrapped_inner, str):
                original_string = wrapped_inner
            elif isinstance(wrapped_inner, StringIO):
                original_string = wrapped_inner.getvalue()

        if _HAS_ORJSON:
            if original_string is not None:
                json_dict = orjson.loads(original_string)
            elif isinstance(b_stream, BytesIOWrapper):
                # use the CharacterStream instead
                c_stream = source.getCharacterStream()
                json_dict = orjson.loads(c_stream.read())
            else:
                # orjson assumes its in utf-8 encoding so
                # don't bother to check the source.getEncoding()
                json_dict = orjson.loads(b_stream.read())
        else:
            if original_string is not None:
                json_dict = json.loads(original_string)
            else:
                json_dict = json.load(source.getCharacterStream())
        return json_dict, html_base

    # TODO: conneg for JSON (fix support in rdflib's URLInputSource!)
    source = create_input_source(source, format="json-ld")
    try:
        content_type = source.content_type
    except (AttributeError, LookupError):
        content_type = None

    is_html = content_type is not None and content_type.lower() in (
        "text/html",
        "application/xhtml+xml",
    )
    if is_html:
        html_docparser: Optional[HTMLJSONParser] = HTMLJSONParser(
            fragment_id=fragment_id, extract_all_scripts=extract_all_scripts
        )
    else:
        html_docparser = None

    # Either stream may be unavailable on a given input source.
    try:
        b_stream = source.getByteStream()
    except (AttributeError, LookupError):
        b_stream = None
    try:
        c_stream = source.getCharacterStream()
    except (AttributeError, LookupError):
        c_stream = None
    if b_stream is None and c_stream is None:
        raise ValueError(
            f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
        )
    try:
        b_encoding: Optional[str] = None if b_stream is None else source.getEncoding()
    except (AttributeError, LookupError):
        b_encoding = None

    underlying_string: Optional[str] = None
    if b_stream is not None and isinstance(b_stream, BytesIOWrapper):
        # Try to find an underlying wrapped Unicode string to use?
        wrapped_inner = b_stream.wrapped
        if isinstance(wrapped_inner, str):
            underlying_string = wrapped_inner
        elif isinstance(wrapped_inner, StringIO):
            underlying_string = wrapped_inner.getvalue()

    try:
        if is_html and html_docparser is not None:
            # Offload parsing to the HTMLJSONParser
            if underlying_string is not None:
                html_string: str = underlying_string
            elif c_stream is not None:
                html_string = c_stream.read()
            else:
                if TYPE_CHECKING:
                    assert b_stream is not None
                if b_encoding is None:
                    b_encoding = "utf-8"
                html_string = TextIOWrapper(b_stream, encoding=b_encoding).read()
            html_docparser.feed(html_string)
            json_dict, html_base = html_docparser.get_json(), html_docparser.get_base()
        elif _HAS_ORJSON:
            html_base = None
            if underlying_string is not None:
                json_dict = orjson.loads(underlying_string)
            elif (
                (b_stream is not None and isinstance(b_stream, BytesIOWrapper))
                or b_stream is None
            ) and c_stream is not None:
                # use the CharacterStream instead
                json_dict = orjson.loads(c_stream.read())
            else:
                if TYPE_CHECKING:
                    assert b_stream is not None
                # b_stream is not None
                json_dict = orjson.loads(b_stream.read())
        else:
            html_base = None
            if underlying_string is not None:
                # Assign instead of returning the bare document, so this
                # path yields the same (json, html_base) tuple as the others.
                json_dict = json.loads(underlying_string)
            else:
                if c_stream is not None:
                    use_stream = c_stream
                else:
                    if TYPE_CHECKING:
                        assert b_stream is not None
                    # b_stream is not None
                    if b_encoding is None:
                        b_encoding = "utf-8"
                    use_stream = TextIOWrapper(b_stream, encoding=b_encoding)
                json_dict = json.load(use_stream)
        return json_dict, html_base
    finally:
        # Close whichever streams were opened; closing is best-effort.
        if b_stream is not None:
            try:
                b_stream.close()
            except AttributeError:
                pass
        if c_stream is not None:
            try:
                c_stream.close()
            except AttributeError:
                pass
# Delimiters tried, in this order, when splitting an IRI.
VOCAB_DELIMS = ("#", "/", ":")


def split_iri(iri: str) -> Tuple[str, Optional[str]]:
    """
    Split an IRI after the last occurrence of the first delimiter it contains.

    :param iri: the IRI to split.
    :return: ``(namespace, local)`` where ``namespace`` ends with the
        delimiter, or ``(iri, None)`` when no delimiter from
        ``VOCAB_DELIMS`` occurs in ``iri``.
    """
    for delim in VOCAB_DELIMS:
        head, found, tail = iri.rpartition(delim)
        if found:
            return head + found, tail
    return iri, None
def norm_url(base: str, url: str) -> str:
    """
    Resolve ``url`` against ``base`` and normalize the resulting path.

    >>> norm_url('http://example.org/', '/one')
    'http://example.org/one'
    >>> norm_url('http://example.org/', '/one#')
    'http://example.org/one#'
    >>> norm_url('http://example.org/one', 'two')
    'http://example.org/two'
    >>> norm_url('http://example.org/one/', 'two')
    'http://example.org/one/two'
    >>> norm_url('http://example.org/', 'http://example.net/one')
    'http://example.net/one'
    >>> norm_url('http://example.org/', 'http://example.org//one')
    'http://example.org//one'
    """
    if "://" in url:
        return url

    split_base = urlsplit(base)
    split_url = urlsplit(url)
    if split_url.scheme:
        # Assume full URL
        return url

    if split_base.scheme in ("urn", "urn-x"):
        # Fix for URNs: no scheme on url -> assume relative and join paths,
        # treating everything after the first "/" of the base as the path
        head, _, tail = split_base.path.partition("/")
        joined = urljoin("/" + tail, split_url.path)
        suffix = f"#{split_url.fragment}" if split_url.fragment else ""
        result = f"{split_base.scheme}:{head}{joined}{suffix}"
    else:
        resolved = urlsplit(urljoin(base, url))
        raw_path = resolved.path
        path = normpath(raw_path)
        if sep != "/":
            path = "/".join(path.split(sep))
        # normpath drops a trailing slash; put it back
        if raw_path.endswith("/") and not path.endswith("/"):
            path += "/"
        result = urlunsplit(
            (resolved.scheme, resolved.netloc, path, resolved.query, resolved.fragment)
        )

    # keep a trailing empty fragment marker that resolution removed
    if url.endswith("#") and not result.endswith("#"):
        result += "#"
    return result
def context_from_urlinputsource(source: URLInputSource) -> Optional[str]:
    """
    Return the context URL announced by an HTTP Link header, if any.

    Please note that JSON-LD documents served with the application/ld+json media type
    MUST have all context information, including references to external contexts,
    within the body of the document. Contexts linked via a
    http://www.w3.org/ns/json-ld#context HTTP Link Header MUST be
    ignored for such documents.

    :param source: the input source whose ``content_type``, ``links`` and
        ``url`` attributes are inspected.
    :return: the link target resolved against ``source.url`` for the first
        usable Link header with the JSON-LD context relation, or ``None``
        when the document is ``application/ld+json``, the source exposes no
        ``links``, or no header contains a ``<...>`` target.
    """
    if source.content_type == "application/ld+json":
        return None
    try:
        # source.links is the new way of getting Link headers from URLInputSource
        links = source.links
    except AttributeError:
        return None
    for link in links:
        if ' rel="http://www.w3.org/ns/json-ld#context"' in link:
            # find() reports a missing bracket as -1 (index() would raise),
            # so a header without a well-formed <target> is skipped
            start, end = link.find("<"), link.find(">")
            if -1 < start < end:
                # type error: Value of type variable "AnyStr" of "urljoin" cannot be "Optional[str]"
                return urljoin(source.url, link[start + 1 : end])  # type: ignore[type-var]
    return None
__all__ = [
"json",
"source_to_json",
"split_iri",
"norm_url",
"context_from_urlinputsource",
"orjson",
"_HAS_ORJSON",
]
class HTMLJSONParser(HTMLParser):
    """
    Collect the JSON embedded in ``application/ld+json`` script elements.

    Feed an HTML document with :meth:`feed`, then read the collected
    documents with :meth:`get_json` and the ``href`` of the last ``base``
    element seen with :meth:`get_base`.

    :param fragment_id: when set, script elements whose ``id`` attribute
        differs from it are ignored.
    :param extract_all_scripts: when false, only the first script element
        with non-blank content is collected.
    """

    def __init__(
        self,
        fragment_id: Optional[str] = None,
        extract_all_scripts: Optional[bool] = False,
    ):
        super().__init__()
        self.fragment_id = fragment_id
        self.json: List[Dict] = []
        # True while inside a script element of type application/ld+json
        self.contains_json = False
        self.fragment_id_does_not_match = False
        self.base: Optional[str] = None
        self.extract_all_scripts = extract_all_scripts
        self.script_count = 0

    def handle_starttag(self, tag, attrs):
        self.contains_json = False
        self.fragment_id_does_not_match = False

        # Only set self. contains_json to True if the
        # type is 'application/ld+json'
        if tag == "script":
            for attr, value in attrs:
                if attr == "type" and value == "application/ld+json":
                    self.contains_json = True
                elif attr == "id" and self.fragment_id and value != self.fragment_id:
                    self.fragment_id_does_not_match = True

        elif tag == "base":
            for attr, value in attrs:
                if attr == "href":
                    self.base = value

    def handle_endtag(self, tag):
        # Leaving a script element ends the JSON context; without this the
        # text that follows </script> would be handed to the JSON decoder.
        if tag == "script":
            self.contains_json = False
            self.fragment_id_does_not_match = False

    def handle_data(self, data):
        # Only do something when we know the context is a
        # script element containing application/ld+json
        if self.contains_json is True and self.fragment_id_does_not_match is False:
            if not self.extract_all_scripts and self.script_count > 0:
                return

            if data.strip() == "":
                # skip empty data elements
                return

            # Try to parse the json
            if _HAS_ORJSON:
                # orjson can load a unicode string
                # if that's the only thing we have,
                # its not worth encoding it to bytes
                parsed = orjson.loads(data)
            else:
                parsed = json.loads(data)

            # Add to the result document
            if isinstance(parsed, list):
                self.json.extend(parsed)
            else:
                self.json.append(parsed)

            self.script_count += 1

    def get_json(self) -> List[Dict]:
        """Return the JSON documents collected so far."""
        return self.json

    def get_base(self):
        """Return the ``href`` of the last ``base`` element seen, or ``None``."""
        return self.base
@@ -0,0 +1,63 @@
"""
SPARQL implementation for RDFLib
.. versionadded:: 4.0
"""
from importlib.metadata import entry_points
from typing import TYPE_CHECKING
SPARQL_LOAD_GRAPHS = True
"""
If True, using FROM <uri> and FROM NAMED <uri>
will load/parse more data
"""
SPARQL_DEFAULT_GRAPH_UNION = True
"""
If True - the default graph in the RDF Dataset is the union of all
named graphs (like RDFLib's ConjunctiveGraph)
"""
CUSTOM_EVALS = {}
"""
Custom evaluation functions
These must be functions taking (ctx, part) and raise
NotImplementedError if they cannot handle a certain part
"""
PLUGIN_ENTRY_POINT = "rdf.plugins.sparqleval"
from . import operators, parser, parserutils
from .processor import prepareQuery, prepareUpdate, processUpdate
assert parser
assert operators
assert parserutils
# Load every evaluator registered under the PLUGIN_ENTRY_POINT group and
# store it in CUSTOM_EVALS, keyed by the entry point's name.
all_entry_points = entry_points()
if hasattr(all_entry_points, "select"):
    # selection interface available: filter by group directly
    for ep in all_entry_points.select(group=PLUGIN_ENTRY_POINT):
        CUSTOM_EVALS[ep.name] = ep.load()
else:
    # Prior to Python 3.10, this returns a dict instead of the selection interface
    if TYPE_CHECKING:
        assert isinstance(all_entry_points, dict)
    for ep in all_entry_points.get(PLUGIN_ENTRY_POINT, []):
        CUSTOM_EVALS[ep.name] = ep.load()
__all__ = [
"prepareQuery",
"prepareUpdate",
"processUpdate",
"operators",
"parser",
"parserutils",
"CUSTOM_EVALS",
]
@@ -0,0 +1,316 @@
"""
Aggregation functions
"""
from __future__ import annotations
from decimal import Decimal
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Mapping,
MutableMapping,
Optional,
Set,
Tuple,
TypeVar,
Union,
overload,
)
from rdflib.namespace import XSD
from rdflib.plugins.sparql.datatypes import type_promotion
from rdflib.plugins.sparql.evalutils import _eval, _val
from rdflib.plugins.sparql.operators import numeric
from rdflib.plugins.sparql.parserutils import CompValue
from rdflib.plugins.sparql.sparql import FrozenBindings, NotBoundError, SPARQLTypeError
from rdflib.term import BNode, Identifier, Literal, URIRef, Variable
class Accumulator:
    """Base class holding the state shared by all aggregation functions."""

    def __init__(self, aggregation: CompValue):
        # declared here, provided by the concrete subclasses
        self.get_value: Callable[[], Optional[Literal]]
        self.update: Callable[[FrozenBindings, Aggregator], None]
        self.var = aggregation.res
        self.expr = aggregation.vars
        if aggregation.distinct:
            self.distinct = aggregation.distinct
            self.seen: Set[Any] = set()
        else:
            # without DISTINCT every row is used, so skip the "seen" lookup
            # type error: Cannot assign to a method
            self.use_row = self.dont_care  # type: ignore[method-assign]
            self.distinct = False

    def dont_care(self, row: FrozenBindings) -> bool:
        """Accept every row (no distinct test)."""
        return True

    def use_row(self, row: FrozenBindings) -> bool:
        """Accept a row only if its evaluated value has not been seen yet."""
        candidate = _eval(self.expr, row)
        return candidate not in self.seen

    def set_value(self, bindings: MutableMapping[Variable, Identifier]) -> None:
        """Store the final value under this accumulator's variable."""
        # type error: Incompatible types in assignment (expression has type "Optional[Literal]", target has type "Identifier")
        bindings[self.var] = self.get_value()  # type: ignore[assignment]
class Counter(Accumulator):
    """Accumulator that counts the rows for which its expression is bound."""

    def __init__(self, aggregation: CompValue):
        super().__init__(aggregation)
        self.value = 0
        if self.expr == "*":
            # cannot eval "*" => always use the full row
            # type error: Cannot assign to a method
            self.eval_row = self.eval_full_row  # type: ignore[assignment]

    def update(self, row: FrozenBindings, aggregator: Aggregator) -> None:
        """Count ``row`` unless its expression is unbound."""
        try:
            counted = self.eval_row(row)
        except NotBoundError:
            # skip UNDEF
            return
        else:
            self.value += 1
            if self.distinct:
                self.seen.add(counted)

    def get_value(self) -> Literal:
        """Return the count as a literal."""
        return Literal(self.value)

    def eval_row(self, row: FrozenBindings) -> Identifier:
        """Evaluate the counted expression for ``row``."""
        return _eval(self.expr, row)

    def eval_full_row(self, row: FrozenBindings) -> FrozenBindings:
        """Use the whole row as the counted value."""
        return row

    def use_row(self, row: FrozenBindings) -> bool:
        """Accept a row whose value is bound and has not been seen yet."""
        try:
            already_seen = self.eval_row(row) in self.seen
        except NotBoundError:
            # happens when counting zero optional nodes. See issue #2229
            return False
        return not already_seen
@overload
def type_safe_numbers(*args: int) -> Tuple[int]: ...


@overload
def type_safe_numbers(
    *args: Union[Decimal, float, int]
) -> Tuple[Union[float, int]]: ...


def type_safe_numbers(*args: Union[Decimal, float, int]) -> Iterable[Union[float, int]]:
    """
    Make a group of numbers safe to add together.

    :param args: the numbers to combine.
    :return: an iterator over all arguments converted to ``float`` when
        both a ``float`` and a ``Decimal`` are present, otherwise the
        argument tuple unchanged.
    """
    saw_float = any(isinstance(arg, float) for arg in args)
    if saw_float and any(isinstance(arg, Decimal) for arg in args):
        return map(float, args)
    # type error: Incompatible return value type (got "Tuple[Union[Decimal, float, int], ...]", expected "Iterable[Union[float, int]]")
    # NOTE on type error: a Decimal mixed with a float never gets here.
    return args  # type: ignore[return-value]
class Sum(Accumulator):
    """Accumulator that adds up the numeric values of its expression."""

    def __init__(self, aggregation: CompValue):
        super().__init__(aggregation)
        self.value = 0
        self.datatype: Optional[str] = None

    def update(self, row: FrozenBindings, aggregator: Aggregator) -> None:
        """Add the value for ``row`` and promote the running datatype."""
        try:
            value = _eval(self.expr, row)
            # the first value fixes the datatype, later ones promote it
            # type error: Argument 1 to "type_promotion" has incompatible type "str"; expected "URIRef"
            self.datatype = (
                value.datatype
                if self.datatype is None
                else type_promotion(self.datatype, value.datatype)  # type: ignore[arg-type]
            )
            self.value = sum(type_safe_numbers(self.value, numeric(value)))
            if self.distinct:
                self.seen.add(value)
        except NotBoundError:
            # skip UNDEF
            pass

    def get_value(self) -> Literal:
        """Return the total as a literal of the promoted datatype."""
        return Literal(self.value, datatype=self.datatype)
class Average(Accumulator):
    """Accumulator that averages the numeric values of its expression."""

    def __init__(self, aggregation: CompValue):
        super().__init__(aggregation)
        self.counter = 0
        self.sum = 0
        self.datatype: Optional[str] = None

    def update(self, row: FrozenBindings, aggregator: Aggregator) -> None:
        """Add the value for ``row`` to the running sum and count."""
        try:
            value = _eval(self.expr, row)
            previous_dt = self.datatype
            self.sum = sum(type_safe_numbers(self.sum, numeric(value)))
            # type error: Argument 1 to "type_promotion" has incompatible type "str"; expected "URIRef"
            self.datatype = (
                value.datatype
                if previous_dt is None
                else type_promotion(previous_dt, value.datatype)  # type: ignore[arg-type]
            )
            if self.distinct:
                self.seen.add(value)
            self.counter += 1
        except (NotBoundError, SPARQLTypeError):
            # skip UNDEF or BNode => SPARQLTypeError
            pass

    def get_value(self) -> Literal:
        """Return the mean, or ``0`` when no value was counted."""
        if not self.counter:
            return Literal(0)
        if self.datatype in (XSD.float, XSD.double):
            return Literal(self.sum / self.counter)
        return Literal(Decimal(self.sum) / Decimal(self.counter))
class Extremum(Accumulator):
    """Shared implementation for Minimum and Maximum."""

    def __init__(self, aggregation: CompValue):
        # supplied by the subclass as a method
        self.compare: Callable[[Any, Any], Any]
        super().__init__(aggregation)
        self.value: Any = None
        # DISTINCT would not change the value for MIN or MAX
        # type error: Cannot assign to a method
        self.use_row = self.dont_care  # type: ignore[method-assign]

    def set_value(self, bindings: MutableMapping[Variable, Identifier]) -> None:
        """Bind the extremum, leaving the variable unset if none was found."""
        if self.value is None:
            # simply do not set if self.value is still None
            return
        bindings[self.var] = Literal(self.value)

    def update(self, row: FrozenBindings, aggregator: Aggregator) -> None:
        """Fold the value for ``row`` into the current extremum."""
        try:
            candidate = _eval(self.expr, row)
            if self.value is None:
                self.value = candidate
            else:
                # self.compare is implemented by Minimum/Maximum
                self.value = self.compare(self.value, candidate)
        except (NotBoundError, SPARQLTypeError):
            # skip UNDEF or BNode => SPARQLTypeError
            pass
# Type variable tying both compare() operands and its result to one term kind.
_ValueT = TypeVar("_ValueT", Variable, BNode, URIRef, Literal)


class Minimum(Extremum):
    """Extremum that keeps the smaller of the stored and the new value."""

    def compare(self, val1: _ValueT, val2: _ValueT) -> _ValueT:
        # ordering is decided by the _val key function
        return min(val1, val2, key=_val)


class Maximum(Extremum):
    """Extremum that keeps the larger of the stored and the new value."""

    def compare(self, val1: _ValueT, val2: _ValueT) -> _ValueT:
        # ordering is decided by the _val key function
        return max(val1, val2, key=_val)
class Sample(Accumulator):
    """Accumulator that keeps the first value it can evaluate."""

    def __init__(self, aggregation):
        super().__init__(aggregation)
        # DISTINCT would not change the value
        # type error: Cannot assign to a method
        self.use_row = self.dont_care  # type: ignore[method-assign]

    def update(self, row: FrozenBindings, aggregator: Aggregator) -> None:
        """Bind the value for ``row`` and retire this accumulator."""
        try:
            # set the value now
            sampled = _eval(self.expr, row)
            aggregator.bindings[self.var] = sampled
            # and skip this accumulator for future rows
            aggregator.accumulators.pop(self.var)
        except NotBoundError:
            pass

    def get_value(self) -> None:
        """Return ``None``: reached only when no value was ever set."""
        return None
class GroupConcat(Accumulator):
    """Accumulator that joins the string forms of all collected values."""

    value: List[Literal]

    def __init__(self, aggregation: CompValue):
        super().__init__(aggregation)
        # only GROUPCONCAT needs to have a list as accumulator
        self.value = []
        # a single space is used when no separator was given
        self.separator = (
            " " if aggregation.separator is None else aggregation.separator
        )

    def update(self, row: FrozenBindings, aggregator: Aggregator) -> None:
        """Collect the value for ``row`` unless it is undefined."""
        try:
            value = _eval(self.expr, row)
            if isinstance(value, NotBoundError):
                # skip UNDEF
                return
            self.value.append(value)
            if self.distinct:
                self.seen.add(value)
        except NotBoundError:
            # skip UNDEF
            # NOTE: It seems like this is not the way undefined values occur, they
            # come through not as exceptions but as values. This is left here
            # however as it may occur in some cases.
            # TODO: Consider removing this.
            pass

    def get_value(self) -> Literal:
        """Return the collected values joined by the separator."""
        return Literal(self.separator.join(map(str, self.value)))
class Aggregator:
    """Drives a set of Accumulator objects, one per aggregation."""

    # algebra node name -> accumulator implementation
    accumulator_classes = {
        "Aggregate_Count": Counter,
        "Aggregate_Sample": Sample,
        "Aggregate_Sum": Sum,
        "Aggregate_Avg": Average,
        "Aggregate_Min": Minimum,
        "Aggregate_Max": Maximum,
        "Aggregate_GroupConcat": GroupConcat,
    }

    def __init__(self, aggregations: List[CompValue]):
        self.bindings: Dict[Variable, Identifier] = {}
        self.accumulators: Dict[str, Accumulator] = {}
        for aggregation in aggregations:
            factory = self.accumulator_classes.get(aggregation.name)
            if factory is None:
                raise Exception("Unknown aggregate function " + aggregation.name)
            self.accumulators[aggregation.res] = factory(aggregation)

    def update(self, row: FrozenBindings) -> None:
        """Feed ``row`` to every accumulator that accepts it."""
        # SAMPLE accumulators may delete themselves
        # => iterate over a snapshot, not the live view
        for accumulator in tuple(self.accumulators.values()):
            if accumulator.use_row(row):
                accumulator.update(row, self)

    def get_bindings(self) -> Mapping[Variable, Identifier]:
        """Write every accumulator's final value and return the bindings."""
        for accumulator in self.accumulators.values():
            accumulator.set_value(self.bindings)
        return self.bindings
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,102 @@
"""
Utility functions for supporting the XML Schema Datatypes hierarchy
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Dict, List, Optional, Set
from rdflib.namespace import XSD
if TYPE_CHECKING:
from rdflib.term import URIRef
# The XSD datatypes this module knows about.
XSD_DTs: Set[URIRef] = set(
    (
        XSD.integer,
        XSD.decimal,
        XSD.float,
        XSD.double,
        XSD.string,
        XSD.boolean,
        XSD.dateTime,
        XSD.nonPositiveInteger,
        XSD.negativeInteger,
        XSD.long,
        XSD.int,
        XSD.short,
        XSD.byte,
        XSD.nonNegativeInteger,
        XSD.unsignedLong,
        XSD.unsignedInt,
        XSD.unsignedShort,
        XSD.unsignedByte,
        XSD.positiveInteger,
        XSD.date,
    )
)

# adding dateTime datatypes
XSD_DateTime_DTs = set((XSD.dateTime, XSD.date, XSD.time))

XSD_Duration_DTs = set((XSD.duration, XSD.dayTimeDuration, XSD.yearMonthDuration))

# Super-type -> list of the datatypes treated as its sub-types.
_sub_types: Dict[URIRef, List[URIRef]] = {
    XSD.integer: [
        XSD.nonPositiveInteger,
        XSD.negativeInteger,
        XSD.long,
        XSD.int,
        XSD.short,
        XSD.byte,
        XSD.nonNegativeInteger,
        XSD.positiveInteger,
        XSD.unsignedLong,
        XSD.unsignedInt,
        XSD.unsignedShort,
        XSD.unsignedByte,
    ],
}

# Inverse of _sub_types (sub-type -> super-type), built for every datatype
# in XSD_DTs that has an entry in _sub_types.
_super_types: Dict[URIRef, URIRef] = {}
for superdt in XSD_DTs:
    for subdt in _sub_types.get(superdt, []):
        _super_types[subdt] = superdt

# we only care about float, double, integer, decimal
# Indexed as _typePromotionMap[t1][t2] -> datatype of the combined result.
_typePromotionMap: Dict[URIRef, Dict[URIRef, URIRef]] = {
    XSD.float: {XSD.integer: XSD.float, XSD.decimal: XSD.float, XSD.double: XSD.double},
    XSD.double: {
        XSD.integer: XSD.double,
        XSD.float: XSD.double,
        XSD.decimal: XSD.double,
    },
    XSD.decimal: {
        XSD.integer: XSD.decimal,
        XSD.float: XSD.float,
        XSD.double: XSD.double,
    },
    XSD.integer: {
        XSD.decimal: XSD.decimal,
        XSD.float: XSD.float,
        XSD.double: XSD.double,
    },
}
def type_promotion(t1: URIRef, t2: Optional[URIRef]) -> URIRef:
    """
    Return the datatype that results from combining ``t1`` and ``t2``.

    :param t1: the first datatype.
    :param t2: the second datatype; ``None`` leaves ``t1`` unchanged.
    :return: ``t1`` when ``t2`` is ``None``; the shared super-type when
        both reduce to the same one; otherwise the entry of the promotion
        table for the two super-types.
    :raises TypeError: if the promotion table has no entry for the pair.
    """
    if t2 is None:
        return t1
    left = _super_types.get(t1, t1)
    right = _super_types.get(t2, t2)
    if left == right:
        return left  # matching super-types
    try:
        return _typePromotionMap[left][right]
    except KeyError:
        raise TypeError("Operators cannot combine datatypes %s and %s" % (left, right))
@@ -0,0 +1,685 @@
"""
These methods recursively evaluate the SPARQL Algebra
evalQuery is the entry-point, it will setup context and
return the SPARQLResult object
evalPart is called on each level and will delegate to the right method
A rdflib.plugins.sparql.sparql.QueryContext is passed along, keeping
information needed for evaluation
A list of dicts (solution mappings) is returned, apart from GroupBy which may
also return a dict of list of dicts
"""
from __future__ import annotations
import collections
import itertools
import re
from typing import (
TYPE_CHECKING,
Any,
Deque,
Dict,
Generator,
Iterable,
List,
Mapping,
Optional,
Tuple,
Union,
)
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from pyparsing import ParseException
from rdflib.graph import Graph
from rdflib.plugins.sparql import CUSTOM_EVALS, parser
from rdflib.plugins.sparql.aggregates import Aggregator
from rdflib.plugins.sparql.evalutils import (
_ebv,
_eval,
_fillTemplate,
_join,
_minus,
_val,
)
from rdflib.plugins.sparql.parserutils import CompValue, value
from rdflib.plugins.sparql.sparql import (
AlreadyBound,
FrozenBindings,
FrozenDict,
Query,
QueryContext,
SPARQLError,
)
from rdflib.term import BNode, Identifier, Literal, URIRef, Variable
if TYPE_CHECKING:
from rdflib.paths import Path
import json
try:
import orjson
_HAS_ORJSON = True
except ImportError:
orjson = None # type: ignore[assignment, unused-ignore]
_HAS_ORJSON = False
_Triple = Tuple[Identifier, Identifier, Identifier]
def evalBGP(
    ctx: QueryContext, bgp: List[_Triple]
) -> Generator[FrozenBindings, None, None]:
    """
    A basic graph pattern

    Matches the first triple pattern against ``ctx.graph`` and recurses on
    the remaining patterns; an empty pattern list yields the current
    solution.
    """
    if not bgp:
        yield ctx.solution()
        return

    s, p, o = bgp[0]
    remaining = bgp[1:]
    _s, _p, _o = ctx[s], ctx[p], ctx[o]
    # a pushed context is only needed when some position gets bound below
    has_unbound = None in (_s, _p, _o)

    # type error: Item "None" of "Optional[Graph]" has no attribute "triples"
    # type Argument 1 to "triples" of "Graph" has incompatible type "Tuple[Union[str, Path, None], Union[str, Path, None], Union[str, Path, None]]"; expected "Tuple[Optional[Node], Optional[Node], Optional[Node]]"
    for ss, sp, so in ctx.graph.triples((_s, _p, _o)):  # type: ignore[union-attr, arg-type]
        c = ctx.push() if has_unbound else ctx

        if _s is None:
            # type error: Incompatible types in assignment (expression has type "Union[Node, Any]", target has type "Identifier")
            c[s] = ss  # type: ignore[assignment]

        try:
            if _p is None:
                c[p] = sp  # type: ignore[assignment]
            if _o is None:
                c[o] = so  # type: ignore[assignment]
        except AlreadyBound:
            # skip this triple when a binding is rejected as already bound
            continue

        yield from evalBGP(c, remaining)
def evalExtend(
    ctx: QueryContext, extend: CompValue
) -> Generator[FrozenBindings, None, None]:
    """
    Add the value of ``extend.expr`` to each solution of ``extend.p``.

    A solution whose expression evaluates to, or raises, a ``SPARQLError``
    is yielded unchanged.
    """
    # TODO: Deal with dict returned from evalPart from GROUP BY
    for solution in evalPart(ctx, extend.p):
        try:
            scoped = solution.forget(ctx, _except=extend._vars)
            computed = _eval(extend.expr, scoped)
            if isinstance(computed, SPARQLError):
                raise computed
            yield solution.merge({extend.var: computed})
        except SPARQLError:
            yield solution
def evalLazyJoin(
    ctx: QueryContext, join: CompValue
) -> Generator[FrozenBindings, None, None]:
    """
    A lazy join will push the variables bound
    in the first part to the second part,
    essentially doing the join implicitly
    hopefully evaluating much fewer triples
    """
    for left in evalPart(ctx, join.p1):
        bound_ctx = ctx.thaw(left)
        for right in evalPart(bound_ctx, join.p2):
            # merge, as some bindings may have been forgotten
            yield right.merge(left)
def evalJoin(ctx: QueryContext, join: CompValue) -> Generator[FrozenDict, None, None]:
    """
    Join the solutions of ``join.p1`` and ``join.p2``.

    Delegates to :func:`evalLazyJoin` when ``join.lazy`` is set; otherwise
    the second part is evaluated into a set and joined with the first.
    """
    # TODO: Deal with dict returned from evalPart from GROUP BY
    # only ever for join.p1
    if join.lazy:
        return evalLazyJoin(ctx, join)

    left = evalPart(ctx, join.p1)
    right = set(evalPart(ctx, join.p2))
    return _join(left, right)
def evalUnion(ctx: QueryContext, union: CompValue) -> List[Any]:
    """Return the solutions of ``union.p1`` followed by those of ``union.p2``."""
    combined = list(evalPart(ctx, union.p1))
    combined.extend(evalPart(ctx, union.p2))
    return combined
def evalMinus(ctx: QueryContext, minus: CompValue) -> Generator[FrozenDict, None, None]:
    """Apply ``_minus`` to the solutions of ``minus.p1`` and the set of ``minus.p2`` solutions."""
    return _minus(evalPart(ctx, minus.p1), set(evalPart(ctx, minus.p2)))
def evalLeftJoin(
    ctx: QueryContext, join: CompValue
) -> Generator[FrozenBindings, None, None]:
    """
    Extend each solution of ``join.p1`` with matches of ``join.p2``.

    Matches passing ``join.expr`` are yielded; a left solution without any
    such match is yielded on its own, subject to the re-check below.
    """
    for left in evalPart(ctx, join.p1):
        matched = False
        bound_ctx = ctx.thaw(left)
        for right in evalPart(bound_ctx, join.p2):
            if _ebv(join.expr, right.forget(ctx)):
                matched = True
                yield right
        if matched:
            continue

        # we've cheated, the ctx above may contain
        # vars bound outside our scope
        # before we yield a solution without the OPTIONAL part
        # check that we would have had no OPTIONAL matches
        # even without prior bindings...
        p1_vars = join.p1._vars
        if p1_vars is None or not any(
            _ebv(join.expr, candidate)
            for candidate in evalPart(ctx.thaw(left.remember(p1_vars)), join.p2)
        ):
            yield left
def evalFilter(
    ctx: QueryContext, part: CompValue
) -> Generator[FrozenBindings, None, None]:
    """Yield the solutions of ``part.p`` for which ``part.expr`` holds."""
    # TODO: Deal with dict returned from evalPart!
    for solution in evalPart(ctx, part.p):
        if part.no_isolated_scope:
            scope = solution
        else:
            scope = solution.forget(ctx, _except=part._vars)
        if _ebv(part.expr, scope):
            yield solution
def evalGraph(
    ctx: QueryContext, part: CompValue
) -> Generator[FrozenBindings, None, None]:
    """
    Evaluate ``part.p`` inside the graph(s) selected by ``part.term``.

    When ``part.term`` is bound, only that graph is used; otherwise every
    context of the dataset except the default one is tried and
    ``part.term`` is bound to the graph identifier in the solutions.

    :raises Exception: if ``ctx.dataset`` is ``None``.
    """
    if ctx.dataset is None:
        raise Exception(
            "Non-conjunctive-graph doesn't know about "
            "graphs. Try a query without GRAPH."
        )

    ctx = ctx.clone()
    graph: Union[str, Path, None, Graph] = ctx[part.term]
    prev_graph = ctx.graph

    if graph is not None:
        if TYPE_CHECKING:
            assert not isinstance(graph, Graph)
        # type error: Argument 1 to "get_context" of "ConjunctiveGraph" has incompatible type "Union[str, Path]"; expected "Union[Node, str, None]"
        scoped = ctx.pushGraph(ctx.dataset.get_context(graph))  # type: ignore[arg-type]
        for solution in evalPart(scoped, part.p):
            solution.ctx.graph = prev_graph
            yield solution
        return

    for graph in ctx.dataset.contexts():
        # in SPARQL the default graph is NOT a named graph
        if graph == ctx.dataset.default_context:
            continue

        scoped = ctx.pushGraph(graph).push()
        graph_solution = [{part.term: graph.identifier}]
        for solution in _join(evalPart(scoped, part.p), graph_solution):
            solution.ctx.graph = prev_graph
            yield solution
def evalValues(
    ctx: QueryContext, part: CompValue
) -> Generator[FrozenBindings, None, None]:
    """
    Yield one solution per row of ``part.p.res``.

    ``"UNDEF"`` entries are left unbound; a row that conflicts with an
    existing binding is skipped.
    """
    for row in part.p.res:
        child = ctx.push()
        try:
            for var, val in row.items():
                if val != "UNDEF":
                    child[var] = val
        except AlreadyBound:
            continue
        else:
            yield child.solution()
def evalMultiset(ctx: QueryContext, part: CompValue):
    """Evaluate the wrapped part, handling a ``values`` part via :func:`evalValues`."""
    inner = part.p
    if inner.name == "values":
        return evalValues(ctx, part)
    return evalPart(ctx, inner)
def evalPart(ctx: QueryContext, part: CompValue) -> Any:
    """
    Evaluate one algebra node by dispatching on ``part.name``.

    Functions registered in ``CUSTOM_EVALS`` are tried first; one that
    raises ``NotImplementedError`` is skipped.

    :raises Exception: if no evaluator exists for ``part.name``.
    """
    # try custom evaluation functions
    for custom_eval in CUSTOM_EVALS.values():
        try:
            return custom_eval(ctx, part)
        except NotImplementedError:
            continue  # the given custom function did not handle this part

    name = part.name

    if name == "BGP":
        # Reorder triples patterns by number of bound nodes in the current ctx
        # Do patterns with more bound nodes first
        triples = sorted(
            part.triples, key=lambda t: sum(1 for n in t if ctx[n] is None)
        )
        return evalBGP(ctx, triples)

    if name == "Filter":
        return evalFilter(ctx, part)
    if name == "Join":
        return evalJoin(ctx, part)
    if name == "LeftJoin":
        return evalLeftJoin(ctx, part)
    if name == "Graph":
        return evalGraph(ctx, part)
    if name == "Union":
        return evalUnion(ctx, part)
    if name == "ToMultiSet":
        return evalMultiset(ctx, part)
    if name == "Extend":
        return evalExtend(ctx, part)
    if name == "Minus":
        return evalMinus(ctx, part)

    if name == "Project":
        return evalProject(ctx, part)
    if name == "Slice":
        return evalSlice(ctx, part)
    if name == "Distinct":
        return evalDistinct(ctx, part)
    if name == "Reduced":
        return evalReduced(ctx, part)
    if name == "OrderBy":
        return evalOrderBy(ctx, part)
    if name == "Group":
        return evalGroup(ctx, part)
    if name == "AggregateJoin":
        return evalAggregateJoin(ctx, part)

    if name == "SelectQuery":
        return evalSelectQuery(ctx, part)
    if name == "AskQuery":
        return evalAskQuery(ctx, part)
    if name == "ConstructQuery":
        return evalConstructQuery(ctx, part)

    if name == "ServiceGraphPattern":
        return evalServiceQuery(ctx, part)

    if name == "DescribeQuery":
        return evalDescribeQuery(ctx, part)

    raise Exception("I dont know: %s" % name)
def evalServiceQuery(ctx: QueryContext, part: CompValue):
    """
    Evaluate a SERVICE pattern by querying the remote endpoint.

    The endpoint URL and inner query are taken from ``part``'s
    ``service_string``. The query is sent with GET when it is shorter than
    600 characters and with POST otherwise, and the JSON results are turned
    into solutions. Nothing is yielded when ``service_string`` does not
    have the expected ``service <url> { ... }`` shape.

    :param ctx: the current query context.
    :param part: the ``ServiceGraphPattern`` algebra node.
    :raises Exception: if the endpoint answers with a status other than 200.
    """
    match = re.match(
        "^service <(.*)>[ \n]*{(.*)}[ \n]*$",
        # type error: Argument 2 to "get" of "CompValue" has incompatible type "str"; expected "bool" [arg-type]
        part.get("service_string", ""),  # type: ignore[arg-type]
        re.DOTALL | re.I,
    )
    if not match:
        return

    # NOTE(review): service_url comes verbatim from the query text and is
    # handed to urlopen; callers evaluating untrusted queries may want to
    # restrict which endpoints can be reached.
    service_url = match.group(1)
    service_query = _buildQueryStringForServiceCall(ctx, match.group(2))

    query_settings = {"query": service_query, "output": "json"}
    headers = {
        "accept": "application/sparql-results+json",
        "user-agent": "rdflibForAnUser",
    }
    # GET is easier to cache so prefer that if the query is not to long
    if len(service_query) < 600:
        request = Request(service_url + "?" + urlencode(query_settings), headers=headers)
    else:
        request = Request(
            service_url,
            data=urlencode(query_settings).encode(),
            headers=headers,
        )

    # Read everything needed up front so the connection is always released,
    # even if the consumer never exhausts this generator.
    response = urlopen(request)
    try:
        status = response.status
        payload = response.read() if status == 200 else None
    finally:
        response.close()

    if status != 200:
        raise Exception(
            "Service: %s responded with code: %s" % (service_url, status)
        )

    json_dict = orjson.loads(payload) if _HAS_ORJSON else json.loads(payload)
    variables = json_dict["head"]["vars"]
    for r in json_dict["results"]["bindings"]:
        # type error: Argument 2 to "_yieldBindingsFromServiceCallResult" has incompatible type "str"; expected "Dict[str, Dict[str, str]]"
        yield from _yieldBindingsFromServiceCallResult(ctx, r, variables)  # type: ignore[arg-type]
"""
Build a query string to be used by the service call.
It is supposed to pass in the existing bound solutions.
Re-adds prefixes if added and sets the base.
Wraps it in select if needed.
"""
def _buildQueryStringForServiceCall(ctx: QueryContext, service_query: str) -> str:
try:
parser.parseQuery(service_query)
except ParseException:
# This could be because we don't have a select around the service call.
service_query = "SELECT REDUCED * WHERE {" + service_query + "}"
# type error: Item "None" of "Optional[Prologue]" has no attribute "namespace_manager"
for p in ctx.prologue.namespace_manager.store.namespaces(): # type: ignore[union-attr]
service_query = "PREFIX " + p[0] + ":" + p[1].n3() + " " + service_query
# re add the base if one was defined
# type error: Item "None" of "Optional[Prologue]" has no attribute "base"
base = ctx.prologue.base # type: ignore[union-attr]
if base is not None and len(base) > 0:
service_query = "BASE <" + base + "> " + service_query
sol = [v for v in ctx.solution() if isinstance(v, Variable)]
if len(sol) > 0:
variables = " ".join([v.n3() for v in sol])
variables_bound = " ".join([ctx.get(v).n3() for v in sol])
service_query = (
service_query + "VALUES (" + variables + ") {(" + variables_bound + ")}"
)
return service_query
def _yieldBindingsFromServiceCallResult(
ctx: QueryContext, r: Dict[str, Dict[str, str]], variables: List[str]
) -> Generator[FrozenBindings, None, None]:
res_dict: Dict[Variable, Identifier] = {}
for var in variables:
if var in r and r[var]:
var_binding = r[var]
var_type = var_binding["type"]
if var_type == "uri":
res_dict[Variable(var)] = URIRef(var_binding["value"])
elif var_type == "literal":
res_dict[Variable(var)] = Literal(
var_binding["value"],
datatype=var_binding.get("datatype"),
lang=var_binding.get("xml:lang"),
)
# This is here because of
# https://www.w3.org/TR/2006/NOTE-rdf-sparql-json-res-20061004/#variable-binding-results
elif var_type == "typed-literal":
res_dict[Variable(var)] = Literal(
var_binding["value"], datatype=URIRef(var_binding["datatype"])
)
elif var_type == "bnode":
res_dict[Variable(var)] = BNode(var_binding["value"])
else:
raise ValueError(f"invalid type {var_type!r} for variable {var!r}")
yield FrozenBindings(ctx, res_dict)
def evalGroup(ctx: QueryContext, group: CompValue):
    """
    Evaluate a Group algebra node.

    See http://www.w3.org/TR/sparql11-query/#defn_algGroup
    """
    # The grouping itself is left to evalAggregateJoin; only the wrapped
    # pattern is evaluated here.
    inner = group.p
    return evalPart(ctx, inner)
def evalAggregateJoin(
    ctx: QueryContext, agg: CompValue
) -> Generator[FrozenBindings, None, None]:
    """
    Evaluate an AggregateJoin node.

    Rows produced by ``agg.p`` are fed into one ``Aggregator`` per group key
    (or into a single aggregator when there is no grouping expression), and
    one solution is yielded per aggregator. When no aggregator was created at
    all, a single empty solution is yielded instead.
    """
    rows = evalPart(ctx, agg.p)
    # agg.p is always a Group, so its grouping expression can be read directly
    grouping = agg.p.expr
    groups: Dict[Any, Any] = collections.defaultdict(
        lambda: Aggregator(aggregations=agg.A)
    )

    if grouping is not None:
        for row in rows:
            # pick the aggregator belonging to this row's group
            key = tuple(_eval(e, row, False) for e in grouping)
            groups[key].update(row)
    else:
        # no grouping, e.g. just COUNT in the SELECT clause: one aggregator
        single = groups[True]
        for row in rows:
            single.update(row)

    # every row has been consumed; emit the aggregated values
    for aggregator in groups.values():
        yield FrozenBindings(ctx, aggregator.get_bindings())

    # nothing matched at all
    if not groups:
        yield FrozenBindings(ctx)
def evalOrderBy(
    ctx: QueryContext, part: CompValue
) -> Generator[FrozenBindings, None, None]:
    """
    Evaluate an OrderBy node.

    The rows of ``part.p`` are sorted once per order condition, starting with
    the last condition so that the first one ends up most significant.
    """
    rows = evalPart(ctx, part.p)

    for cond in reversed(part.expr):
        descending = bool(cond.order and cond.order == "DESC")

        def sort_key(row, _cond=cond):
            return _val(value(row, _cond.expr, variables=True))

        rows = sorted(rows, key=sort_key, reverse=descending)

    return rows
def evalSlice(ctx: QueryContext, slice: CompValue):
    """
    Evaluate a Slice node: skip ``slice.start`` rows of ``slice.p`` and, when
    ``slice.length`` is set, stop after that many further rows.
    """
    rows = evalPart(ctx, slice.p)
    if slice.length is None:
        stop = None
    else:
        stop = slice.start + slice.length
    return itertools.islice(rows, slice.start, stop)
def evalReduced(
    ctx: QueryContext, part: CompValue
) -> Generator[FrozenBindings, None, None]:
    """Apply REDUCED to the rows of ``part.p``.

    REDUCED is not as strict as DISTINCT, but if the incoming rows were sorted
    it should produce the same result with limited extra memory and time per
    incoming row.
    """
    # A small buffer of recently seen rows is kept; a row is only suppressed
    # while it is still in that buffer. This relates to an LRU cache:
    # https://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used_.28LRU.29

    # TODO: add configuration or determine "best" size for most use cases
    # 0: No reduction
    # 1: compare only with the last row, almost no reduction with
    #    unordered incoming rows
    # N: The greater the buffer size the greater the reduction but more
    #    memory and time are needed
    MAX = 1

    # mixed data structure: set for lookup, deque for append/pop/remove
    seen = set()
    recency: Deque[Any] = collections.deque()

    for row in evalPart(ctx, part.p):
        if row not in seen:
            # row seems to be new
            yield row
            seen.add(row)
            if len(seen) > MAX:
                # drop the least recently used row from buffer
                seen.remove(recency.pop())
        else:
            # forget last position of row
            recency.remove(row)

        # put row to the front
        recency.appendleft(row)
def evalDistinct(
    ctx: QueryContext, part: CompValue
) -> Generator[FrozenBindings, None, None]:
    """Yield each row of ``part.p`` only the first time it is seen."""
    seen = set()
    for row in evalPart(ctx, part.p):
        if row in seen:
            continue
        yield row
        seen.add(row)
def evalProject(ctx: QueryContext, project: CompValue):
    """Lazily project every row of ``project.p`` onto ``project.PV``."""
    rows = evalPart(ctx, project.p)

    def _projected():
        for row in rows:
            yield row.project(project.PV)

    return _projected()
def evalSelectQuery(
    ctx: QueryContext, query: CompValue
) -> Mapping[str, Union[str, List[Variable], Iterable[FrozenDict]]]:
    """
    Evaluate a SELECT query and return a result mapping with the keys
    ``type_``, ``bindings`` and ``vars_``.
    """
    result: Dict[str, Union[str, List[Variable], Iterable[FrozenDict]]] = {
        "type_": "SELECT",
        "bindings": evalPart(ctx, query.p),
        "vars_": query.PV,
    }
    return result
def evalAskQuery(ctx: QueryContext, query: CompValue) -> Mapping[str, Union[str, bool]]:
    """
    Evaluate an ASK query.

    ``askAnswer`` is True as soon as the pattern produces a first row, and
    False when it produces none.
    """
    # any() stops pulling rows after the first one, like an early break
    answer = any(True for _ in evalPart(ctx, query.p))
    result: Dict[str, Union[bool, str]] = {"type_": "ASK", "askAnswer": answer}
    return result
def evalConstructQuery(
    ctx: QueryContext, query: CompValue
) -> Mapping[str, Union[str, Graph]]:
    """
    Evaluate a CONSTRUCT query.

    The template is filled once per solution of ``query.p`` and the resulting
    triples are collected in a new graph, returned under the ``graph`` key.
    """
    # A construct-where query has no explicit template; fall back to the
    # triples of the pattern itself (query -> project -> bgp ...).
    template = query.template or query.p.p.triples

    graph = Graph()
    for solution in evalPart(ctx, query.p):
        graph += _fillTemplate(template, solution)

    result: Dict[str, Union[str, Graph]] = {"type_": "CONSTRUCT", "graph": graph}
    return result
def evalDescribeQuery(ctx: QueryContext, query) -> Dict[str, Union[str, Graph]]:
    """
    Evaluate a DESCRIBE query.

    Collects the resources to describe (explicit IRIs plus every value bound
    by the WHERE clause, if there is one) and adds the ``cbd`` of each of them
    to a fresh graph that carries the namespace bindings of the queried graph.
    """
    # Result graph, with the namespaces of the graph being queried
    graph = Graph()
    # type error: Item "None" of "Optional[Graph]" has no attribute "namespaces"
    for prefix, namespace in ctx.graph.namespaces():  # type: ignore[union-attr]
        graph.bind(prefix, namespace)

    # Explicit IRIs may be given to DESCRIBE, also next to projected
    # variables when there is a WHERE clause.
    to_describe = {iri for iri in query.PV if isinstance(iri, URIRef)}

    # With a WHERE clause, every value of every solution is described too.
    if query.p is not None:
        for binding in evalPart(ctx, query.p):
            to_describe.update(binding.values())

    # Get a CBD for all resources identified to describe
    for resource in to_describe:
        # type error: Item "None" of "Optional[Graph]" has no attribute "cbd"
        ctx.graph.cbd(resource, target_graph=graph)  # type: ignore[union-attr]

    result: Dict[str, Union[str, Graph]] = {"type_": "DESCRIBE", "graph": graph}
    return result
def evalQuery(
    graph: Graph,
    query: Query,
    initBindings: Optional[Mapping[str, Identifier]] = None,
    base: Optional[str] = None,
) -> Mapping[Any, Any]:
    """
    Evaluate a translated query against ``graph``.

    The keys of ``initBindings`` are converted to ``Variable`` objects and
    used as the initial bindings of the query context.

    .. caution::

        This method can access indirectly requested network endpoints, for
        example, query processing will attempt to access network endpoints
        specified in ``SERVICE`` directives.

        When processing untrusted or potentially malicious queries, measures
        should be taken to restrict network and file access.

        For information on available security measures, see the RDFLib
        :doc:`Security Considerations </security_considerations>`
        documentation.
    """
    algebra = query.algebra
    bound = {Variable(name): term for name, term in (initBindings or {}).items()}

    ctx = QueryContext(
        graph, initBindings=bound, datasetClause=algebra.datasetClause
    )
    ctx.prologue = query.prologue

    return evalPart(ctx, algebra)
@@ -0,0 +1,188 @@
from __future__ import annotations
import collections
from typing import (
Any,
DefaultDict,
Generator,
Iterable,
Mapping,
Set,
Tuple,
TypeVar,
Union,
overload,
)
from rdflib.plugins.sparql.operators import EBV
from rdflib.plugins.sparql.parserutils import CompValue, Expr
from rdflib.plugins.sparql.sparql import (
FrozenBindings,
FrozenDict,
NotBoundError,
QueryContext,
SPARQLError,
)
from rdflib.term import BNode, Identifier, Literal, URIRef, Variable
# Either kind of context object; used below as the type of a "solution"
# whose ``get`` method resolves template terms.
_ContextType = Union[FrozenBindings, QueryContext]
# Type variable bound to FrozenDict, so helpers can return the same
# (sub)type of mapping they were given.
_FrozenDictT = TypeVar("_FrozenDictT", bound=FrozenDict)
def _diff(
a: Iterable[_FrozenDictT], b: Iterable[_FrozenDictT], expr
) -> Set[_FrozenDictT]:
res = set()
for x in a:
if all(not x.compatible(y) or not _ebv(expr, x.merge(y)) for y in b):
res.add(x)
return res
def _minus(
a: Iterable[_FrozenDictT], b: Iterable[_FrozenDictT]
) -> Generator[_FrozenDictT, None, None]:
for x in a:
if all((not x.compatible(y)) or x.disjointDomain(y) for y in b):
yield x
@overload
def _join(
a: Iterable[FrozenBindings], b: Iterable[Mapping[Identifier, Identifier]]
) -> Generator[FrozenBindings, None, None]: ...
@overload
def _join(
a: Iterable[FrozenDict], b: Iterable[Mapping[Identifier, Identifier]]
) -> Generator[FrozenDict, None, None]: ...
def _join(
a: Iterable[FrozenDict], b: Iterable[Mapping[Identifier, Identifier]]
) -> Generator[FrozenDict, None, None]:
for x in a:
for y in b:
if x.compatible(y):
yield x.merge(y)
def _ebv(expr: Union[Literal, Variable, Expr], ctx: FrozenDict) -> bool:
    """
    Return true/false for the given expr.

    Either the expr is itself true/false, or it evaluates to something with
    the given ctx. An error counts as false.

    :raises Exception: if ``expr`` is a CompValue that is not an Expr.
    """
    try:
        return EBV(expr)
    except SPARQLError:
        # expr has no boolean value of its own; try evaluating it below
        pass
    if isinstance(expr, Expr):
        try:
            return EBV(expr.eval(ctx))
        except SPARQLError:
            return False  # filter error == False
    # type error: Subclass of "Literal" and "CompValue" cannot exist: would have incompatible method signatures
    elif isinstance(expr, CompValue):  # type: ignore[unreachable]
        raise Exception("Weird - filter got a CompValue without evalfn! %r" % expr)
    elif isinstance(expr, Variable):
        try:
            return EBV(ctx[expr])
        except Exception:
            # Unbound variable or a value without a boolean value: false.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            return False
    return False
@overload
def _eval(
    expr: Union[Literal, URIRef],
    ctx: FrozenBindings,
    raise_not_bound_error: bool = ...,
) -> Union[Literal, URIRef]: ...


@overload
def _eval(
    expr: Union[Variable, Expr],
    ctx: FrozenBindings,
    raise_not_bound_error: bool = ...,
) -> Union[Any, SPARQLError]: ...


def _eval(
    expr: Union[Literal, URIRef, Variable, Expr],
    ctx: FrozenBindings,
    raise_not_bound_error: bool = True,
) -> Any:
    """
    Evaluate ``expr`` in ``ctx``.

    Literals and URIRefs are returned unchanged, an Expr is evaluated, and a
    Variable is looked up in ``ctx``. For an unbound variable either
    ``NotBoundError`` is raised or, when ``raise_not_bound_error`` is false,
    None is returned. Anything else raises ``Exception``.
    """
    if isinstance(expr, (Literal, URIRef)):
        return expr

    if isinstance(expr, Expr):
        return expr.eval(ctx)

    if isinstance(expr, Variable):
        try:
            return ctx[expr]
        except KeyError:
            if not raise_not_bound_error:
                return None
            raise NotBoundError("Variable %s is not bound" % expr)

    if isinstance(expr, CompValue):  # type: ignore[unreachable]
        raise Exception("Weird - _eval got a CompValue without evalfn! %r" % expr)

    raise Exception("Cannot eval thing: %s (%s)" % (expr, type(expr)))
def _filter(
    a: Iterable[FrozenDict], expr: Union[Literal, Variable, Expr]
) -> Generator[FrozenDict, None, None]:
    """Yield the solutions of ``a`` for which ``expr`` has a true boolean value."""
    yield from (solution for solution in a if _ebv(expr, solution))
def _fillTemplate(
    template: Iterable[Tuple[Identifier, Identifier, Identifier]],
    solution: _ContextType,
) -> Generator[Tuple[Identifier, Identifier, Identifier], None, None]:
    """
    For construct/deleteWhere and friends.

    Instantiate each triple of ``template`` with the values of ``solution``
    and yield it, skipping triples in which any position ended up as None.
    Blank nodes of the template are replaced by fresh ones, the same fresh
    node being reused for the same template node within one call.
    """
    fresh_bnodes: DefaultDict[BNode, BNode] = collections.defaultdict(BNode)

    for triple in template:
        s, p, o = triple
        looked_up = (solution.get(s), solution.get(p), solution.get(o))

        # instantiate new bnodes for each solution
        filled = tuple(
            fresh_bnodes[term] if isinstance(term, BNode) else found
            for term, found in zip(triple, looked_up)
        )

        if all(term is not None for term in filled):
            yield filled
_ValueT = TypeVar("_ValueT", Variable, BNode, URIRef, Literal)


def _val(v: _ValueT) -> Tuple[int, _ValueT]:
    """
    Utility for ordering things.

    Pairs ``v`` with a rank by term type (Variable 0, BNode 1, URIRef 2,
    Literal 3). Returns None for any other kind of value.
    """
    for rank, term_type in enumerate((Variable, BNode, URIRef, Literal)):
        if isinstance(v, term_type):
            return (rank, v)
    return None
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,316 @@
"""
NOTE: PyParsing setResultName/__call__ provides a very similar solution to this
I didn't realise at the time of writing and I will remove a
lot of this code at some point
Utility classes for creating an abstract-syntax tree out with pyparsing actions
Lets you label and group parts of parser production rules
For example:
# [5] BaseDecl ::= 'BASE' IRIREF
BaseDecl = Comp('Base', Keyword('BASE') + Param('iri',IRIREF))
After parsing, this gives you back a CompValue object,
which is a dict/object with the parameters specified.
So you can access the parameters as attributes or as keys:
baseDecl.iri
Comp lets you set an evalFn that is bound to the eval method of
the resulting CompValue
"""
from __future__ import annotations
from collections import OrderedDict
from types import MethodType
from typing import (
TYPE_CHECKING,
Any,
Callable,
List,
Mapping,
Optional,
Tuple,
TypeVar,
Union,
)
from pyparsing import ParserElement, ParseResults, TokenConverter, originalTextFor
from rdflib.term import BNode, Identifier, Variable
if TYPE_CHECKING:
from rdflib.plugins.sparql.sparql import FrozenBindings
# This is an alternative
# Comp('Sum')( Param('x')(Number) + '+' + Param('y')(Number) )
def value(
    ctx: FrozenBindings,
    val: Any,
    variables: bool = False,
    errors: bool = False,
) -> Any:
    """
    Utility function for evaluating something.

    An Expr is evaluated, a list is processed element by element, and a
    BNode or Variable is looked up in ``ctx``. A ParseResults holding exactly
    one item is unwrapped. Everything else is returned unchanged.

    Normally an unbound variable raises ``NotBoundError``;
    set ``variables=True`` to get the variable itself back instead.

    Normally a looked-up error is raised;
    set ``errors=True`` to get the error returned instead.
    """
    if isinstance(val, Expr):
        return val.eval(ctx)  # recurse?

    if isinstance(val, CompValue):
        raise Exception("What do I do with this CompValue? %s" % val)

    if isinstance(val, list):
        return [value(ctx, item, variables, errors) for item in val]

    if isinstance(val, (BNode, Variable)):
        bound = ctx.get(val)
        if isinstance(bound, SPARQLError) and not errors:
            raise bound
        if bound is not None:
            return bound
        # not bound
        if not variables:
            raise NotBoundError
        return val

    if isinstance(val, ParseResults) and len(val) == 1:
        return value(ctx, val[0], variables, errors)

    return val
class ParamValue:
    """
    The result of parsing a Param.

    Only stores the name, the value and the list flag;
    all cleverness is in the CompValue.
    """

    def __init__(
        self, name: str, tokenList: Union[List[Any], ParseResults], isList: bool
    ):
        self.isList = isList
        self.name = name
        # a single-item token list is stored as the bare item
        single = isinstance(tokenList, (list, ParseResults)) and len(tokenList) == 1
        self.tokenList = tokenList[0] if single else tokenList

    def __str__(self) -> str:
        return f"Param({self.name!s}, {self.tokenList!s})"
class Param(TokenConverter):
    """
    A pyparsing token for labelling a part of the parse-tree.

    If isList is true, repeat occurrences of ParamList have
    their values merged in a list.
    """

    def __init__(self, name: str, expr, isList: bool = False):
        self.isList = isList
        super().__init__(expr)
        self.setName(name)
        # wrap whatever was parsed into a ParamValue carrying this name
        self.addParseAction(self.postParse2)

    def postParse2(self, tokenList: Union[List[Any], ParseResults]) -> ParamValue:
        return ParamValue(name=self.name, tokenList=tokenList, isList=self.isList)
class ParamList(Param):
    """
    Shorthand for a Param created with isList=True.
    """

    def __init__(self, name: str, expr):
        super().__init__(name, expr, isList=True)
_ValT = TypeVar("_ValT")


class CompValue(OrderedDict):
    """
    The result of parsing a Comp.

    Any included Params are available as dict keys or as attributes.
    Looking up an attribute that is not a key yields None instead of
    raising ``AttributeError``.
    """

    def __init__(self, name: str, **values):
        OrderedDict.__init__(self)
        self.name = name
        self.update(values)

    def clone(self) -> CompValue:
        """Return a new CompValue with the same name and items."""
        return CompValue(self.name, **self)

    def __str__(self) -> str:
        return self.name + "_" + OrderedDict.__str__(self)

    def __repr__(self) -> str:
        return self.name + "_" + dict.__repr__(self)

    def _value(
        self, val: _ValT, variables: bool = False, errors: bool = False
    ) -> Union[_ValT, Any]:
        """
        Resolve ``val`` through ``value()`` while a context is set on this
        object; return it unchanged otherwise.
        """
        if self.ctx is not None:
            # forward both flags; `errors` used to be silently dropped here
            return value(self.ctx, val, variables, errors)
        else:
            return val

    def __getitem__(self, a):
        return self._value(OrderedDict.__getitem__(self, a))

    # type error: Signature of "get" incompatible with supertype "dict"
    # type error: Signature of "get" incompatible with supertype "Mapping"  [override]
    def get(self, a, variables: bool = False, errors: bool = False):  # type: ignore[override]
        # NOTE: a missing key falls back to the key itself, not to None
        return self._value(OrderedDict.get(self, a, a), variables, errors)

    def __getattr__(self, a: str) -> Any:
        # Hack hack: OrderedDict relies on this
        if a in ("_OrderedDict__root", "_OrderedDict__end"):
            raise AttributeError()
        try:
            return self[a]
        except KeyError:
            # raise AttributeError('no such attribute '+a)
            return None

    if TYPE_CHECKING:
        # this is here because properties are dynamically set on CompValue
        def __setattr__(self, __name: str, __value: Any) -> None: ...
class Expr(CompValue):
    """
    A CompValue that is evaluatable.
    """

    def __init__(
        self,
        name: str,
        evalfn: Optional[Callable[[Any, Any], Any]] = None,
        **values,
    ):
        super().__init__(name, **values)
        self._evalfn = None
        if evalfn:
            # bind the function so that it receives this Expr as first argument
            self._evalfn = MethodType(evalfn, self)

    def eval(self, ctx: Any = None) -> Union[SPARQLError, Any]:
        """
        Evaluate this expression with ``ctx``.

        ``ctx`` is set on the object for the duration of the call and reset
        to None afterwards. A ``SPARQLError`` raised by the evaluation
        function is returned rather than raised. When ``ctx`` is omitted a
        fresh empty dict is used.
        """
        if ctx is None:
            # a new dict per call instead of one mutable default shared by all calls
            ctx = {}
        try:
            self.ctx: Optional[Union[Mapping, FrozenBindings]] = ctx
            # type error: "None" not callable
            return self._evalfn(ctx)  # type: ignore[misc]
        except SPARQLError as e:
            return e
        finally:
            self.ctx = None
class Comp(TokenConverter):
    """
    A pyparsing token for grouping together things with a label.

    Any sub-tokens that are not Params will be ignored.

    Returns CompValue / Expr objects - depending on whether evalFn is set.
    """

    def __init__(self, name: str, expr: ParserElement):
        self.expr = expr
        TokenConverter.__init__(self, expr)
        self.setName(name)
        self.evalfn: Optional[Callable[[Any, Any], Any]] = None

    def postParse(
        self, instring: str, loc: int, tokenList: ParseResults
    ) -> Union[Expr, CompValue]:
        res: Union[Expr, CompValue]
        if self.evalfn:
            # Expr binds the evaluation function to itself
            res = Expr(self.name, evalfn=self.evalfn)
        else:
            res = CompValue(self.name)
            if self.name == "ServiceGraphPattern":
                # Then this must be a service graph pattern and have
                # already matched.
                # lets assume there is one, for now, then test for two later.
                original_text = originalTextFor(self.expr)
                res["service_string"] = original_text.searchString(instring)[0][0]

        for token in tokenList:
            if not isinstance(token, ParamValue):
                # only Params contribute to the result
                continue
            if not token.isList:
                res[token.name] = token.tokenList
                continue
            # list-valued params accumulate their occurrences
            if token.name not in res:
                res[token.name] = []
            res[token.name].append(token.tokenList)
        return res

    def setEvalFn(self, evalfn: Callable[[Any, Any], Any]) -> Comp:
        self.evalfn = evalfn
        return self
def prettify_parsetree(t: ParseResults, indent: str = "", depth: int = 0) -> str:
    """
    Render a parse tree as indented text: first the list items of ``t``,
    then its named items in sorted order.
    """
    pieces: List[str] = [
        _prettify_sub_parsetree(item, indent, depth + 1) for item in t.asList()
    ]
    for key, sub in sorted(t.items()):
        pieces.append("%s%s- %s:\n" % (indent, " " * depth, key))
        pieces.append(_prettify_sub_parsetree(sub, indent, depth + 1))
    return "".join(pieces)
def _prettify_sub_parsetree(
    t: Union[Identifier, CompValue, set, list, dict, Tuple, bool, None],
    indent: str = "",
    depth: int = 0,
) -> str:
    """
    Render one node of a parse tree as indented text.

    Mappings are printed key by key (a CompValue additionally gets a header
    line with its name), lists item by item, and anything else via ``repr``.
    """
    pieces: List[str] = []
    # CompValue is itself a dict, so both mapping cases share the item loop
    if isinstance(t, dict):
        if isinstance(t, CompValue):
            pieces.append("%s%s> %s:\n" % (indent, " " * depth, t.name))
        for key, sub in t.items():
            pieces.append("%s%s- %s:\n" % (indent, " " * (depth + 1), key))
            pieces.append(_prettify_sub_parsetree(sub, indent, depth + 2))
    elif isinstance(t, list):
        pieces.extend(
            _prettify_sub_parsetree(item, indent, depth + 1) for item in t
        )
    else:
        pieces.append("%s%s- %r\n" % (indent, " " * depth, t))
    return "".join(pieces)
# hurrah for circular imports
from rdflib.plugins.sparql.sparql import NotBoundError, SPARQLError # noqa: E402
@@ -0,0 +1,147 @@
"""
Code for tying SPARQL Engine into RDFLib
These should be automatically registered with RDFLib
"""
from __future__ import annotations
from typing import Any, Mapping, Optional, Union
from rdflib.graph import Graph
from rdflib.plugins.sparql.algebra import translateQuery, translateUpdate
from rdflib.plugins.sparql.evaluate import evalQuery
from rdflib.plugins.sparql.parser import parseQuery, parseUpdate
from rdflib.plugins.sparql.sparql import Query, Update
from rdflib.plugins.sparql.update import evalUpdate
from rdflib.query import Processor, Result, UpdateProcessor
from rdflib.term import Identifier
def prepareQuery(
    queryString: str,
    initNs: Optional[Mapping[str, Any]] = None,
    base: Optional[str] = None,
) -> Query:
    """
    Parse and translate a SPARQL Query.

    The arguments used are remembered on the returned object as
    ``_original_args``.
    """
    namespaces = {} if initNs is None else initNs
    prepared = translateQuery(parseQuery(queryString), base, namespaces)
    prepared._original_args = (queryString, namespaces, base)
    return prepared
def prepareUpdate(
    updateString: str,
    initNs: Optional[Mapping[str, Any]] = None,
    base: Optional[str] = None,
) -> Update:
    """
    Parse and translate a SPARQL Update.

    The arguments used are remembered on the returned object as
    ``_original_args``.
    """
    namespaces = {} if initNs is None else initNs
    prepared = translateUpdate(parseUpdate(updateString), base, namespaces)
    prepared._original_args = (updateString, namespaces, base)
    return prepared
def processUpdate(
    graph: Graph,
    updateString: str,
    initBindings: Optional[Mapping[str, Identifier]] = None,
    initNs: Optional[Mapping[str, Any]] = None,
    base: Optional[str] = None,
) -> None:
    """
    Process a SPARQL Update Request.

    Parses and translates ``updateString`` and evaluates it against
    ``graph``. Returns nothing on success or raises exceptions on error.
    """
    parsed = parseUpdate(updateString)
    update = translateUpdate(parsed, base, initNs)
    evalUpdate(graph, update, initBindings)
class SPARQLResult(Result):
    """
    A Result filled from an evaluation result mapping.

    ``type_`` is required; ``vars_``, ``bindings``, ``askAnswer`` and
    ``graph`` are copied when present and are None otherwise.
    """

    def __init__(self, res: Mapping[str, Any]):
        super().__init__(res["type_"])
        self.vars = res.get("vars_")
        # type error: Incompatible types in assignment (expression has type "Optional[Any]", variable has type "MutableSequence[Mapping[Variable, Identifier]]")
        self.bindings = res.get("bindings")  # type: ignore[assignment]
        self.askAnswer = res.get("askAnswer")
        self.graph = res.get("graph")
class SPARQLUpdateProcessor(UpdateProcessor):
    """Update processor that evaluates SPARQL Update requests on a graph."""

    def __init__(self, graph):
        self.graph = graph

    def update(
        self,
        strOrQuery: Union[str, Update],
        initBindings: Optional[Mapping[str, Identifier]] = None,
        initNs: Optional[Mapping[str, Any]] = None,
    ) -> None:
        """
        Evaluate an update, given either as a string (which is parsed and
        translated first) or as an already translated ``Update``.

        .. caution::

            This method can access indirectly requested network endpoints, for
            example, query processing will attempt to access network endpoints
            specified in ``SERVICE`` directives.

            When processing untrusted or potentially malicious queries, measures
            should be taken to restrict network and file access.

            For information on available security measures, see the RDFLib
            :doc:`Security Considerations </security_considerations>`
            documentation.
        """
        update = strOrQuery
        if isinstance(update, str):
            update = translateUpdate(parseUpdate(update), initNs=initNs)

        return evalUpdate(self.graph, update, initBindings)
class SPARQLProcessor(Processor):
    """Query processor that evaluates SPARQL queries on a graph."""

    def __init__(self, graph):
        self.graph = graph

    # NOTE on type error: this is because the super type constructor does not
    # accept base argument and the position of the DEBUG argument is
    # different.
    # type error: Signature of "query" incompatible with supertype "Processor"
    def query(  # type: ignore[override]
        self,
        strOrQuery: Union[str, Query],
        initBindings: Optional[Mapping[str, Identifier]] = None,
        initNs: Optional[Mapping[str, Any]] = None,
        base: Optional[str] = None,
        DEBUG: bool = False,
    ) -> Mapping[str, Any]:
        """
        Evaluate a query with the given initial bindings, and initial
        namespaces. The given base is used to resolve relative URIs in
        the query and will be overridden by any BASE given in the query.

        A query given as a string is parsed and translated first.

        .. caution::

            This method can access indirectly requested network endpoints, for
            example, query processing will attempt to access network endpoints
            specified in ``SERVICE`` directives.

            When processing untrusted or potentially malicious queries, measures
            should be taken to restrict network and file access.

            For information on available security measures, see the RDFLib
            :doc:`Security Considerations </security_considerations>`
            documentation.
        """
        query = strOrQuery
        if isinstance(query, str):
            query = translateQuery(parseQuery(query), base, initNs)

        return evalQuery(self.graph, query, initBindings, base)
@@ -0,0 +1,3 @@
"""
Parsers and serializers for SPARQL Result formats
"""
@@ -0,0 +1,104 @@
"""
This module implements a parser and serializer for the CSV SPARQL result
formats
http://www.w3.org/TR/sparql11-results-csv-tsv/
"""
from __future__ import annotations
import codecs
import csv
from io import BufferedIOBase, TextIOBase
from typing import IO, Dict, List, Optional, Union, cast
from rdflib.plugins.sparql.processor import SPARQLResult
from rdflib.query import Result, ResultParser, ResultSerializer
from rdflib.term import BNode, Identifier, Literal, URIRef, Variable
class CSVResultParser(ResultParser):
    """Parser for SELECT results in the SPARQL CSV result format."""

    def __init__(self):
        self.delim = ","

    # type error: Signature of "parse" incompatible with supertype "ResultParser"
    def parse(self, source: IO, content_type: Optional[str] = None) -> Result:  # type: ignore[override]
        """
        Read a CSV result from ``source``.

        The first row gives the variable names; every following row becomes
        one binding.
        """
        result = Result("SELECT")

        if isinstance(source.read(0), bytes):
            # if reading from source returns bytes do utf-8 decoding
            # type error: Incompatible types in assignment (expression has type "StreamReader", variable has type "IO[Any]")
            source = codecs.getreader("utf-8")(source)  # type: ignore[assignment]

        rows = csv.reader(source, delimiter=self.delim)
        header = [Variable(name) for name in next(rows)]
        result.vars = header
        result.bindings = [self.parseRow(row, header) for row in rows]

        return result

    def parseRow(
        self, row: List[str], v: List[Variable]
    ) -> Dict[Variable, Union[BNode, URIRef, Literal]]:
        """Pair variables with the converted cells of ``row``, dropping empty cells."""
        terms = [self.convertTerm(cell) for cell in row]
        return {var: term for var, term in zip(v, terms) if term is not None}

    def convertTerm(self, t: str) -> Optional[Union[BNode, URIRef, Literal]]:
        """Convert one cell: empty means unbound, otherwise BNode, URIRef or Literal."""
        if t == "":
            return None
        if t.startswith("_:"):
            return BNode(t)  # or generate new IDs?
        if t.startswith(("http://", "https://")):  # TODO: more?
            return URIRef(t)
        return Literal(t)
class CSVResultSerializer(ResultSerializer):
    """Serializer for SELECT results in the SPARQL CSV result format."""

    def __init__(self, result: SPARQLResult):
        ResultSerializer.__init__(self, result)

        self.delim = ","
        if result.type != "SELECT":
            raise Exception("CSVSerializer can only serialize select query results")

    def serialize(self, stream: IO, encoding: str = "utf-8", **kwargs) -> None:
        """
        Write the result to ``stream``: a header row with the variables,
        then one row per binding.
        """
        # csv.writer produces strings, so a byte stream is wrapped in a
        # writer that encodes with the requested encoding.

        # TODO: Find a better solution for all this casting
        writable_stream = cast(Union[TextIOBase, BufferedIOBase], stream)
        string_stream: TextIOBase
        if isinstance(writable_stream, TextIOBase):
            string_stream = writable_stream
        else:
            byte_stream = cast(BufferedIOBase, writable_stream)
            string_stream = cast(TextIOBase, codecs.getwriter(encoding)(byte_stream))

        out = csv.writer(string_stream, delimiter=self.delim)

        header = [self.serializeTerm(v, encoding) for v in self.result.vars]  # type: ignore[union-attr]
        out.writerow(header)
        out.writerows(
            [self.serializeTerm(row.get(v), encoding) for v in self.result.vars]  # type: ignore[union-attr]
            for row in self.result.bindings
        )

    def serializeTerm(
        self, term: Optional[Identifier], encoding: str
    ) -> Union[str, Identifier]:
        """Return the cell for ``term``: empty for None, ``_:``-prefixed for blank nodes."""
        if term is None:
            return ""
        if isinstance(term, BNode):
            return f"_:{term}"
        return term
@@ -0,0 +1,16 @@
from __future__ import annotations
from typing import IO, Optional
from rdflib.graph import Graph
from rdflib.query import Result, ResultParser
class GraphResultParser(ResultParser):
    """Parser that loads a serialized graph as a CONSTRUCT result."""

    # type error: Signature of "parse" incompatible with supertype "ResultParser"
    def parse(self, source: IO, content_type: Optional[str]) -> Result:  # type: ignore[override]
        """Parse ``source`` into a new graph, using ``content_type`` as the format."""
        result = Result("CONSTRUCT")  # hmm - or describe?type_)
        result.graph = Graph()
        result.graph.parse(source, format=content_type)
        return result
@@ -0,0 +1,164 @@
"""A Serializer for SPARQL results in JSON:
http://www.w3.org/TR/rdf-sparql-json-res/
Bits and pieces borrowed from:
http://projects.bigasterisk.com/sparqlhttp/
Authors: Drew Perttula, Gunnar Aastrand Grimnes
"""
from __future__ import annotations
import json
from typing import IO, Any, Dict, Mapping, MutableSequence, Optional
from rdflib.query import Result, ResultException, ResultParser, ResultSerializer
from rdflib.term import BNode, Identifier, Literal, URIRef, Variable
try:
import orjson
_HAS_ORJSON = True
except ImportError:
orjson = None # type: ignore[assignment, unused-ignore]
_HAS_ORJSON = False
class JSONResultParser(ResultParser):
    """Parser for SPARQL results in JSON format."""

    # type error: Signature of "parse" incompatible with supertype "ResultParser"
    def parse(self, source: IO, content_type: Optional[str] = None) -> Result:  # type: ignore[override]
        """
        Read all of ``source`` and build a ``JSONResult`` from it.

        With orjson available, a decoding failure is re-raised as
        ``ResultException``.
        """
        raw = source.read()

        if not _HAS_ORJSON:
            if isinstance(raw, bytes):
                raw = raw.decode("utf-8")
            return JSONResult(json.loads(raw))

        try:
            loaded = orjson.loads(raw)
        except Exception as e:
            raise ResultException(f"Failed to parse result: {e}")
        return JSONResult(loaded)
class JSONResultSerializer(ResultSerializer):
    """Serializer for ASK and SELECT results in the SPARQL JSON format."""

    def __init__(self, result: Result):
        ResultSerializer.__init__(self, result)

    # type error: Signature of "serialize" incompatible with supertype "ResultSerializer"
    def serialize(self, stream: IO, encoding: Optional[str] = None) -> None:  # type: ignore[override]
        """
        Write the result as JSON to ``stream``.

        When ``encoding`` is given, bytes are written first and the text form
        is used as a fallback if the stream rejects them; without
        ``encoding``, text is written.

        :raises ResultException: if orjson is in use and fails to serialize.
        """
        res: Dict[str, Any] = {}
        if self.result.type == "ASK":
            res["head"] = {}
            res["boolean"] = self.result.askAnswer
        else:
            # select
            res["results"] = {}
            res["head"] = {}
            res["head"]["vars"] = self.result.vars
            res["results"]["bindings"] = [
                self._bindingToJSON(x) for x in self.result.bindings
            ]
        if _HAS_ORJSON:
            try:
                r_bytes = orjson.dumps(res, option=orjson.OPT_NON_STR_KEYS)
            except Exception as e:
                raise ResultException(f"Failed to serialize result: {e}")
            if encoding is not None:
                # Note, orjson will always write utf-8 even if
                # encoding is specified as something else.
                try:
                    stream.write(r_bytes)
                except (TypeError, ValueError):
                    stream.write(r_bytes.decode("utf-8"))
            else:
                stream.write(r_bytes.decode("utf-8"))
        else:
            r_str = json.dumps(res, allow_nan=False, ensure_ascii=False)
            if encoding is not None:
                try:
                    stream.write(r_str.encode(encoding))
                except (TypeError, ValueError):
                    stream.write(r_str)
            else:
                stream.write(r_str)

    def _bindingToJSON(self, b: Mapping[Variable, Identifier]) -> Dict[Variable, Any]:
        """Convert one binding to its JSON form, leaving out unbound variables."""
        res = {}
        for var in b:
            j = termToJSON(self, b[var])
            if j is not None:
                # reuse the converted term instead of converting it a second time
                res[var] = j
        return res
class JSONResult(Result):
    """
    A Result built from a decoded SPARQL JSON result document.

    A document with a ``boolean`` key is an ASK result, one with a
    ``results`` key a SELECT result; anything else raises
    ``ResultException``.
    """

    def __init__(self, json: Dict[str, Any]):
        self.json = json

        is_ask = "boolean" in json
        if not is_ask and "results" not in json:
            raise ResultException("No boolean or results in json!")
        type_ = "ASK" if is_ask else "SELECT"

        Result.__init__(self, type_)

        if is_ask:
            self.askAnswer = bool(json["boolean"])
        else:
            self.bindings = self._get_bindings()
            self.vars = [Variable(x) for x in json["head"]["vars"]]

    def _get_bindings(self) -> MutableSequence[Mapping[Variable, Identifier]]:
        """Convert every JSON binding row into a mapping from Variable to term."""
        rows: MutableSequence[Mapping[Variable, Identifier]] = []
        for row in self.json["results"]["bindings"]:
            converted: Dict[Variable, Identifier] = {
                Variable(name): parseJsonTerm(cell) for name, cell in row.items()
            }
            rows.append(converted)
        return rows
def parseJsonTerm(d: Dict[str, str]) -> Identifier:
    """Build the rdflib term (Literal, URIRef, BNode) for a json-format dict.

    input is like:
      { 'type': 'uri', 'value': 'http://famegame.com/2006/01/username' }
      { 'type': 'literal', 'value': 'drewp' }

    Raises NotImplementedError for an unknown 'type'.
    """
    kind = d["type"]
    if kind == "uri":
        return URIRef(d["value"])
    if kind == "literal":
        return Literal(d["value"], datatype=d.get("datatype"), lang=d.get("xml:lang"))
    if kind == "typed-literal":
        return Literal(d["value"], datatype=URIRef(d["datatype"]))
    if kind == "bnode":
        return BNode(d["value"])
    raise NotImplementedError("json term type %r" % kind)
def termToJSON(
    self: JSONResultSerializer, term: Optional[Identifier]
) -> Optional[Dict[str, str]]:
    """Return the SPARQL-JSON dict for *term*, or ``None`` when *term* is ``None``.

    Raises ``ResultException`` for any term that is not a URIRef, Literal or
    BNode.
    """
    if isinstance(term, URIRef):
        return {"type": "uri", "value": str(term)}

    if isinstance(term, Literal):
        encoded = {"type": "literal", "value": str(term)}
        datatype, language = term.datatype, term.language
        if datatype is not None:
            encoded["datatype"] = str(datatype)
        if language is not None:
            encoded["xml:lang"] = language
        return encoded

    if isinstance(term, BNode):
        return {"type": "bnode", "value": str(term)}

    if term is None:
        return None

    raise ResultException("Unknown term type: %s (%s)" % (term, type(term)))
@@ -0,0 +1,70 @@
from __future__ import annotations
from typing import IO, Any, MutableMapping, Optional, Union
from rdflib.graph import Graph
from rdflib.namespace import RDF, Namespace
from rdflib.query import Result, ResultParser
from rdflib.term import Node, Variable
RS = Namespace("http://www.w3.org/2001/sw/DataAccess/tests/result-set#")
class RDFResultParser(ResultParser):
    """Result parser that wraps its input in an ``RDFResult``."""

    def parse(self, source: Union[IO, Graph], **kwargs: Any) -> Result:
        """Return an ``RDFResult`` for *source*, forwarding *kwargs* unchanged."""
        result = RDFResult(source, **kwargs)
        return result
class RDFResult(Result):
    """Query result read from an RDF graph using the result-set vocabulary.

    *source* is either a ``Graph`` or something ``Graph.parse`` accepts
    (``kwargs`` are forwarded to ``parse``). If the graph holds no
    ``rs:ResultSet`` node it is treated as a CONSTRUCT result; a result set
    with an ``rs:boolean`` value is an ASK result; otherwise it is SELECT.
    """

    def __init__(self, source: Union[IO, Graph], **kwargs: Any):
        if isinstance(source, Graph):
            graph = source
        else:
            graph = Graph()
            graph.parse(source, **kwargs)

        # Only one result-set node is looked up; there should be just one.
        rs = graph.value(predicate=RDF.type, object=RS.ResultSet)

        if rs is not None:
            askAnswer = graph.value(rs, RS.boolean)
            type_ = "SELECT" if askAnswer is None else "ASK"
        else:
            type_ = "CONSTRUCT"
            # Copy the triples into a fresh graph.
            g = Graph()
            g += graph

        Result.__init__(self, type_)

        if type_ == "CONSTRUCT":
            self.graph = g
        elif type_ == "ASK":
            # type error: Item "Node" of "Optional[Node]" has no attribute "value"
            # type error: Item "None" of "Optional[Node]" has no attribute "value"
            self.askAnswer = askAnswer.value  # type: ignore[union-attr]
            if askAnswer.value is None:  # type: ignore[union-attr]
                raise Exception("Malformed boolean in ask answer!")
        elif type_ == "SELECT":
            # type error: Argument 1 to "Variable" has incompatible type "Node"; expected "str"
            self.vars = [Variable(v) for v in graph.objects(rs, RS.resultVariable)]  # type: ignore[arg-type]

            self.bindings = []
            for solution in graph.objects(rs, RS.solution):
                row: MutableMapping[Variable, Optional[Node]] = {}
                for binding in graph.objects(solution, RS.binding):
                    # type error: Argument 1 to "Variable" has incompatible type "Optional[Node]"; expected "str"
                    row[Variable(graph.value(binding, RS.variable))] = graph.value(  # type: ignore[arg-type]
                        binding, RS.value
                    )
                # error: Argument 1 to "append" of "list" has incompatible type "MutableMapping[Variable, Optional[Node]]"; expected "Mapping[Variable, Identifier]"
                self.bindings.append(row)  # type: ignore[arg-type]
@@ -0,0 +1,105 @@
"""
This implements the Tab Separated SPARQL Result Format
It is implemented with pyparsing, reusing the elements from the SPARQL Parser
"""
from __future__ import annotations
import codecs
import typing
from typing import IO, Union
from pyparsing import (
FollowedBy,
LineEnd,
Literal,
Optional,
ParserElement,
Suppress,
ZeroOrMore,
)
from rdflib.plugins.sparql.parser import (
BLANK_NODE_LABEL,
IRIREF,
LANGTAG,
STRING_LITERAL1,
STRING_LITERAL2,
BooleanLiteral,
NumericLiteral,
Var,
)
from rdflib.plugins.sparql.parserutils import Comp, CompValue, Param
from rdflib.query import Result, ResultParser
from rdflib.term import BNode, URIRef
from rdflib.term import Literal as RDFLiteral
# Tabs separate fields in the TSV format, so only spaces and newlines are
# registered as skippable whitespace here.
# NOTE(review): this is set on the ParserElement class itself, so it presumably
# affects every pyparsing element created afterwards in the process — confirm.
ParserElement.setDefaultWhitespaceChars(" \n")

# A quoted string in either of the two quoting forms reused from the SPARQL parser.
String = STRING_LITERAL1 | STRING_LITERAL2

# A literal: a quoted string optionally followed by either a language tag or
# a "^^" marker plus a datatype IRI.
RDFLITERAL = Comp(
    "literal",
    Param("string", String)
    + Optional(
        Param("lang", LANGTAG.leaveWhitespace())
        | Literal("^^").leaveWhitespace() + Param("datatype", IRIREF).leaveWhitespace()
    ),
)

# Unique sentinel produced for an empty cell; TSVResultParser.convertTerm maps
# it to None.
NONE_VALUE = object()

# An empty cell is recognised by lookahead only: the next thing is a line end
# or a tab.
EMPTY = FollowedBy(LineEnd()) | FollowedBy("\t")
EMPTY.setParseAction(lambda x: NONE_VALUE)

# Any single cell value.
TERM = RDFLITERAL | IRIREF | BLANK_NODE_LABEL | NumericLiteral | BooleanLiteral

# A data row: tab-separated cells, each either empty or a term.
ROW = (EMPTY | TERM) + ZeroOrMore(Suppress("\t") + (EMPTY | TERM))
ROW.parseWithTabs()

# The header row: tab-separated variables.
HEADER = Var + ZeroOrMore(Suppress("\t") + Var)
HEADER.parseWithTabs()
class TSVResultParser(ResultParser):
    """Parser for SELECT results in the tab-separated SPARQL result format."""

    # type error: Signature of "parse" incompatible with supertype "ResultParser"  [override]
    def parse(self, source: IO, content_type: typing.Optional[str] = None) -> Result:  # type: ignore[override]
        """Read a header line of variables followed by one solution per line.

        Byte streams are decoded as UTF-8. Blank lines are skipped.
        """
        reads_bytes = isinstance(source.read(0), bytes)
        if reads_bytes:
            # type error: Incompatible types in assignment (expression has type "StreamReader", variable has type "IO[Any]")
            source = codecs.getreader("utf-8")(source)  # type: ignore[assignment]

        r = Result("SELECT")

        first_line = source.readline()
        r.vars = list(HEADER.parseString(first_line.strip(), parseAll=True))
        r.bindings = []

        while True:
            raw = source.readline()
            if not raw:
                # End of input.
                break
            text = raw.strip("\n")
            if text == "":
                continue

            cells = ROW.parseString(text, parseAll=True)
            converted = (self.convertTerm(cell) for cell in cells)
            # type error: Generator has incompatible item type "object"; expected "Identifier"
            r.bindings.append(dict(zip(r.vars, converted)))  # type: ignore[misc]

        return r

    def convertTerm(
        self, t: Union[object, RDFLiteral, BNode, CompValue, URIRef]
    ) -> typing.Optional[Union[object, BNode, URIRef, RDFLiteral]]:
        """Map one parsed cell to its result value.

        The empty-cell sentinel becomes ``None``, a parsed ``literal``
        component becomes an rdflib literal, and any other non-component value
        is returned as is.
        """
        if t is NONE_VALUE:
            return None
        if not isinstance(t, CompValue):
            return t
        if t.name != "literal":
            raise Exception("I dont know how to handle this: %s" % (t,))
        return RDFLiteral(t.string, lang=t.lang, datatype=t.datatype)
@@ -0,0 +1,86 @@
from __future__ import annotations
from io import StringIO
from typing import IO, List, Optional, Union
from rdflib.namespace import NamespaceManager
from rdflib.query import ResultSerializer
from rdflib.term import BNode, Literal, URIRef, Variable
def _termString(
t: Optional[Union[URIRef, Literal, BNode]],
namespace_manager: Optional[NamespaceManager],
) -> str:
if t is None:
return "-"
if namespace_manager:
if isinstance(t, URIRef):
return namespace_manager.normalizeUri(t)
elif isinstance(t, BNode):
return t.n3()
elif isinstance(t, Literal):
return t._literal_n3(qname_callback=namespace_manager.normalizeUri)
else:
return t.n3()
class TXTResultSerializer(ResultSerializer):
    """
    A write-only QueryResult serializer for text/ascii tables
    """

    def serialize(
        self,
        stream: IO,
        encoding: str = "utf-8",
        *,
        namespace_manager: Optional[NamespaceManager] = None,
        **kwargs,
    ) -> None:
        """
        Write a text table of the query results to *stream*.

        The table is encoded with *encoding* and written as bytes; if the
        stream rejects bytes (``TypeError``/``ValueError``) the text is written
        instead. *namespace_manager*, when given, is used to abbreviate terms.

        Raises ``Exception`` if the result is not a SELECT result.
        """

        def c(s, w):
            """
            center the string s in w wide string
            """
            w -= len(s)
            h1 = h2 = w // 2
            if w % 2:
                # Odd padding: the extra space goes on the right.
                h2 += 1
            return " " * h1 + s + " " * h2

        if self.result.type != "SELECT":
            raise Exception("Can only pretty print SELECT results!")

        string_stream = StringIO()
        if not self.result:
            string_stream.write("(no results)\n")
        else:
            keys: List[Variable] = self.result.vars  # type: ignore[assignment]
            b = [
                # type error: Value of type "Union[Tuple[Node, Node, Node], bool, ResultRow]" is not indexable
                # type error: Argument 1 to "_termString" has incompatible type "Union[Node, Any]"; expected "Union[URIRef, Literal, BNode, None]"  [arg-type]
                # type error: No overload variant of "__getitem__" of "tuple" matches argument type "Variable"
                # NOTE on type error: The problem here is that r can be more types than _termString expects because result can be a result of multiple types.
                [_termString(r[k], namespace_manager) for k in keys]  # type: ignore[index, arg-type, call-overload]
                for r in self.result
            ]

            # Column widths start from the header widths so that a variable
            # name longer than every cell in its column still lines up.
            maxlen = [len(k) for k in keys]
            for r in b:
                for i, cell in enumerate(r):
                    maxlen[i] = max(maxlen[i], len(cell))

            string_stream.write(
                "|".join([c(k, maxlen[i]) for i, k in enumerate(keys)]) + "\n"
            )
            string_stream.write("-" * (len(maxlen) + sum(maxlen)) + "\n")
            for r in sorted(b):
                string_stream.write(
                    "|".join([t.ljust(w) for w, t in zip(maxlen, r)]) + "\n"
                )

        text_val = string_stream.getvalue()
        try:
            stream.write(text_val.encode(encoding))
        except (TypeError, ValueError):
            stream.write(text_val)
@@ -0,0 +1,301 @@
"""A Parser for SPARQL results in XML:
http://www.w3.org/TR/rdf-sparql-XMLres/
Bits and pieces borrowed from:
http://projects.bigasterisk.com/sparqlhttp/
Authors: Drew Perttula, Gunnar Aastrand Grimnes
"""
from __future__ import annotations
import logging
import xml.etree.ElementTree as xml_etree # noqa: N813
from io import BytesIO
from typing import (
IO,
TYPE_CHECKING,
Any,
BinaryIO,
Dict,
Optional,
Sequence,
TextIO,
Tuple,
Union,
cast,
)
from xml.dom import XML_NAMESPACE
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesNSImpl
from rdflib.query import Result, ResultException, ResultParser, ResultSerializer
from rdflib.term import BNode, Identifier, Literal, URIRef, Variable
try:
# https://adamj.eu/tech/2021/12/29/python-type-hints-optional-imports/
import lxml.etree as lxml_etree
FOUND_LXML = True
except ImportError:
FOUND_LXML = False
SPARQL_XML_NAMESPACE = "http://www.w3.org/2005/sparql-results#"
RESULTS_NS_ET = "{%s}" % SPARQL_XML_NAMESPACE
log = logging.getLogger(__name__)
class XMLResultParser(ResultParser):
    """Result parser for the SPARQL XML results format."""

    # TODO FIXME: content_type should be a keyword only arg.
    def parse(self, source: IO, content_type: Optional[str] = None) -> Result:  # type: ignore[override]
        """Return an ``XMLResult`` read from *source*; *content_type* is not used."""
        result = XMLResult(source)
        return result
class XMLResult(Result):
    """Query result parsed from a SPARQL XML results document.

    A ``boolean`` element makes this an ASK result, a ``results`` element a
    SELECT result; a document with neither raises ``ResultException``. lxml is
    used for parsing when it was importable, otherwise the standard library
    ElementTree.
    """

    def __init__(self, source: IO, content_type: Optional[str] = None):
        parser_encoding: Optional[str] = None
        if not hasattr(source, "encoding"):
            if TYPE_CHECKING:
                assert isinstance(source, BinaryIO)
        else:
            if TYPE_CHECKING:
                assert isinstance(source, TextIO)
            # Text stream: re-encode its content as UTF-8 bytes and tell the
            # XML parser to use that encoding.
            parser_encoding = "utf-8"
            source = BytesIO(source.read().encode(parser_encoding))

        if FOUND_LXML:
            tree = cast(
                xml_etree.ElementTree,
                lxml_etree.parse(
                    source,
                    parser=lxml_etree.XMLParser(
                        huge_tree=True, encoding=parser_encoding
                    ),
                ),
            )
        else:
            tree = xml_etree.parse(
                source, parser=xml_etree.XMLParser(encoding=parser_encoding)
            )

        boolean = tree.find(RESULTS_NS_ET + "boolean")
        results = tree.find(RESULTS_NS_ET + "results")
        if boolean is None and results is None:
            raise ResultException("No RDF result-bindings or boolean answer found!")
        type_ = "SELECT" if boolean is None else "ASK"

        Result.__init__(self, type_)

        if type_ != "SELECT":
            self.askAnswer = boolean.text.lower().strip() == "true"  # type: ignore[union-attr]
            return

        result_tag = f"{RESULTS_NS_ET}result"
        binding_tag = f"{RESULTS_NS_ET}binding"

        self.bindings = []
        for result in results:  # type: ignore[union-attr]
            # With lxml iteration also yields comments, so only real
            # "result" elements are processed.
            if result.tag == result_tag:
                row = {}
                for binding in result:
                    # Same filtering for "binding" children.
                    if binding.tag == binding_tag:
                        # NOTE on type error: Element.get() can return None,
                        # and that will fail when passed to Variable.
                        row[Variable(binding.get("name"))] = parseTerm(binding[0])  # type: ignore[arg-type] # FIXME
                self.bindings.append(row)

        head_variables = tree.findall(
            "./%shead/%svariable" % (RESULTS_NS_ET, RESULTS_NS_ET)
        )
        # NOTE on type error: Element.get() can return None, and that will
        # fail when passed to Variable.
        self.vars = [Variable(x.get("name")) for x in head_variables]  # type: ignore[arg-type] # FIXME
def parseTerm(element: xml_etree.Element) -> Union[URIRef, Literal, BNode]:
    """Return the rdflib object (Literal, URIRef, BNode) for the given
    elementtree element.

    Raises ``TypeError`` when the element is not a ``literal``, ``uri`` or
    ``bnode`` element of the SPARQL results namespace.
    """
    tag = element.tag
    text = element.text

    if tag == RESULTS_NS_ET + "uri":
        # type error: Argument 1 to "URIRef" has incompatible type "Optional[str]"; expected "str"
        return URIRef(text)  # type: ignore[arg-type]

    if tag == RESULTS_NS_ET + "bnode":
        return BNode(text)

    if tag != RESULTS_NS_ET + "literal":
        raise TypeError("unknown binding type %r" % element)

    # An empty literal element has no text node at all.
    value = "" if text is None else text
    datatype = None
    lang = None
    lang_attr = "{%s}lang" % XML_NAMESPACE
    raw_datatype = element.get("datatype", None)
    if raw_datatype:
        # A datatype takes precedence; the language is only read without one.
        datatype = URIRef(raw_datatype)
    elif element.get(lang_attr, None):
        lang = element.get(lang_attr)
    return Literal(value, datatype=datatype, lang=lang)
class XMLResultSerializer(ResultSerializer):
    """Serializer that writes a result in the SPARQL XML results format."""

    def __init__(self, result: Result):
        super().__init__(result)

    def serialize(self, stream: IO, encoding: str = "utf-8", **kwargs: Any) -> None:
        """Write the wrapped result to *stream* through a ``SPARQLXMLWriter``.

        ASK results produce an empty head and a boolean; every other result
        type produces the variable header followed by one result element per
        binding row.
        """
        writer = SPARQLXMLWriter(stream, encoding)
        result = self.result

        if result.type != "ASK":
            # type error: Argument 1 to "write_header" of "SPARQLXMLWriter" has incompatible type "Optional[List[Variable]]"; expected "Sequence[Variable]"
            writer.write_header(result.vars)  # type: ignore[arg-type]
            writer.write_results_header()
            for solution in result.bindings:
                writer.write_start_result()
                for variable, term in solution.items():
                    writer.write_binding(variable, term)
                writer.write_end_result()
        else:
            writer.write_header([])
            # type error: Argument 1 to "write_ask" of "SPARQLXMLWriter" has incompatible type "Optional[bool]"; expected "bool"
            writer.write_ask(result.askAnswer)  # type: ignore[arg-type]

        writer.close()
# TODO: Rewrite with ElementTree?
class SPARQLXMLWriter:
    """
    Python saxutils-based SPARQL XML Writer

    The constructor opens the document and the root ``sparql`` element;
    ``close`` ends them. In between, callers write the header and then either
    an ASK boolean or a ``results`` section made of start-result / binding /
    end-result calls.
    """

    def __init__(self, output: IO, encoding: str = "utf-8"):
        writer = XMLGenerator(output, encoding)
        writer.startDocument()
        writer.startPrefixMapping("", SPARQL_XML_NAMESPACE)
        writer.startPrefixMapping("xml", XML_NAMESPACE)
        writer.startElementNS(
            (SPARQL_XML_NAMESPACE, "sparql"), "sparql", AttributesNSImpl({}, {})
        )
        self.writer = writer
        self._output = output
        self._encoding = encoding
        # True once the "results" element has been opened; close() ends it.
        self._results = False
        # True while a "result" element is open. Initialised here so the
        # asserts in write_binding/write_end_result fail with AssertionError
        # rather than AttributeError when called out of order.
        self._resultStarted = False

    def write_header(self, allvarsL: Sequence[Variable]) -> None:
        """Write the ``head`` element with one ``variable`` child per variable."""
        self.writer.startElementNS(
            (SPARQL_XML_NAMESPACE, "head"), "head", AttributesNSImpl({}, {})
        )
        for var in allvarsL:
            attr_vals = {
                (None, "name"): str(var),
            }
            attr_qnames = {
                (None, "name"): "name",
            }
            self.writer.startElementNS(
                (SPARQL_XML_NAMESPACE, "variable"),
                "variable",
                # type error: Argument 1 to "AttributesNSImpl" has incompatible type "Dict[Tuple[None, str], str]"; expected "Mapping[Tuple[str, str], str]"
                # type error: Argument 2 to "AttributesNSImpl" has incompatible type "Dict[Tuple[None, str], str]"; expected "Mapping[Tuple[str, str], str]"  [arg-type]
                AttributesNSImpl(attr_vals, attr_qnames),  # type: ignore[arg-type]
            )
            self.writer.endElementNS((SPARQL_XML_NAMESPACE, "variable"), "variable")
        self.writer.endElementNS((SPARQL_XML_NAMESPACE, "head"), "head")

    def write_ask(self, val: bool) -> None:
        """Write the ``boolean`` element holding the lower-cased *val*."""
        self.writer.startElementNS(
            (SPARQL_XML_NAMESPACE, "boolean"), "boolean", AttributesNSImpl({}, {})
        )
        self.writer.characters(str(val).lower())
        self.writer.endElementNS((SPARQL_XML_NAMESPACE, "boolean"), "boolean")

    def write_results_header(self) -> None:
        """Open the ``results`` element; it is closed by ``close``."""
        self.writer.startElementNS(
            (SPARQL_XML_NAMESPACE, "results"), "results", AttributesNSImpl({}, {})
        )
        self._results = True

    def write_start_result(self) -> None:
        """Open a ``result`` element for one solution."""
        self.writer.startElementNS(
            (SPARQL_XML_NAMESPACE, "result"), "result", AttributesNSImpl({}, {})
        )
        self._resultStarted = True

    def write_end_result(self) -> None:
        """Close the ``result`` element opened by ``write_start_result``."""
        assert self._resultStarted
        self.writer.endElementNS((SPARQL_XML_NAMESPACE, "result"), "result")
        self._resultStarted = False

    def write_binding(self, name: Variable, val: Identifier) -> None:
        """Write one ``binding`` element for variable *name* bound to *val*.

        Must be called between ``write_start_result`` and
        ``write_end_result``. Raises ``Exception`` when *val* is not a URIRef,
        BNode or Literal.
        """
        assert self._resultStarted

        attr_vals: Dict[Tuple[Optional[str], str], str] = {
            (None, "name"): str(name),
        }
        attr_qnames: Dict[Tuple[Optional[str], str], str] = {
            (None, "name"): "name",
        }
        self.writer.startElementNS(
            (SPARQL_XML_NAMESPACE, "binding"),
            "binding",
            # type error: Argument 1 to "AttributesNSImpl" has incompatible type "Dict[Tuple[None, str], str]"; expected "Mapping[Tuple[str, str], str]"
            # type error: Argument 2 to "AttributesNSImpl" has incompatible type "Dict[Tuple[None, str], str]"; expected "Mapping[Tuple[str, str], str]"
            AttributesNSImpl(attr_vals, attr_qnames),  # type: ignore[arg-type, unused-ignore]
        )

        if isinstance(val, URIRef):
            self.writer.startElementNS(
                (SPARQL_XML_NAMESPACE, "uri"), "uri", AttributesNSImpl({}, {})
            )
            self.writer.characters(val)
            self.writer.endElementNS((SPARQL_XML_NAMESPACE, "uri"), "uri")
        elif isinstance(val, BNode):
            self.writer.startElementNS(
                (SPARQL_XML_NAMESPACE, "bnode"), "bnode", AttributesNSImpl({}, {})
            )
            self.writer.characters(val)
            self.writer.endElementNS((SPARQL_XML_NAMESPACE, "bnode"), "bnode")
        elif isinstance(val, Literal):
            attr_vals = {}
            attr_qnames = {}
            # A language tag takes precedence; the datatype is only written
            # when no language is set.
            if val.language:
                attr_vals[(XML_NAMESPACE, "lang")] = val.language
                attr_qnames[(XML_NAMESPACE, "lang")] = "xml:lang"
            elif val.datatype:
                attr_vals[(None, "datatype")] = val.datatype
                attr_qnames[(None, "datatype")] = "datatype"

            self.writer.startElementNS(
                (SPARQL_XML_NAMESPACE, "literal"),
                "literal",
                # type error: Argument 1 to "AttributesNSImpl" has incompatible type "Dict[Tuple[Optional[str], str], str]"; expected "Mapping[Tuple[str, str], str]"
                # type error: Argument 2 to "AttributesNSImpl" has incompatible type "Dict[Tuple[Optional[str], str], str]"; expected "Mapping[Tuple[str, str], str]"
                AttributesNSImpl(attr_vals, attr_qnames),  # type: ignore[arg-type, unused-ignore]
            )
            self.writer.characters(val)
            self.writer.endElementNS((SPARQL_XML_NAMESPACE, "literal"), "literal")
        else:
            raise Exception("Unsupported RDF term: %s" % val)

        self.writer.endElementNS((SPARQL_XML_NAMESPACE, "binding"), "binding")

    def close(self) -> None:
        """End the ``results`` element (if opened), the root element and the document."""
        if self._results:
            self.writer.endElementNS((SPARQL_XML_NAMESPACE, "results"), "results")
        self.writer.endElementNS((SPARQL_XML_NAMESPACE, "sparql"), "sparql")
        self.writer.endDocument()
@@ -0,0 +1,499 @@
from __future__ import annotations
import collections
import datetime
import itertools
import typing as t
from collections.abc import Mapping, MutableMapping
from typing import (
TYPE_CHECKING,
Any,
Container,
Dict,
Generator,
Iterable,
List,
Optional,
Tuple,
TypeVar,
Union,
)
import rdflib.plugins.sparql
from rdflib.graph import ConjunctiveGraph, Dataset, Graph
from rdflib.namespace import NamespaceManager
from rdflib.plugins.sparql.parserutils import CompValue
from rdflib.term import BNode, Identifier, Literal, Node, URIRef, Variable
if TYPE_CHECKING:
from rdflib.paths import Path
_AnyT = TypeVar("_AnyT")
class SPARQLError(Exception):
    """Base class of the SPARQL errors defined in this module."""

    def __init__(self, msg: Optional[str] = None):
        super().__init__(msg)
class NotBoundError(SPARQLError):
    """SPARQL error with an optional message; see the name for its intended use."""

    def __init__(self, msg: Optional[str] = None):
        super().__init__(msg)
class AlreadyBound(SPARQLError):  # noqa: N818
    """Raised when trying to bind a variable that is already bound!"""

    def __init__(self):
        # No message is passed; the base class default is used.
        super().__init__()
class SPARQLTypeError(SPARQLError):
    """SPARQL error whose message argument is required (it may be ``None``)."""

    def __init__(self, msg: Optional[str]):
        super().__init__(msg)
class Bindings(MutableMapping):
    """
    A single level of a stack of variable-value bindings.
    Each level keeps a reference to the level below it (``outer``);
    any failed lookup is propagated to that outer level.

    In python 3.3 this could be a collections.ChainMap
    """

    def __init__(self, outer: Optional["Bindings"] = None, d=()):
        # ``d`` is only copied, never mutated; an immutable default avoids the
        # shared-mutable-default pitfall.
        self._d: Dict[str, str] = dict(d)
        self.outer = outer

    def __getitem__(self, key: str) -> str:
        """Return the value bound at this level, else ask the outer levels."""
        if key in self._d:
            return self._d[key]

        # Compare with None instead of testing truthiness: truthiness would
        # call __len__, which walks the whole chain.
        if self.outer is None:
            raise KeyError(key)
        return self.outer[key]

    def __contains__(self, key: Any) -> bool:
        try:
            self[key]
            return True
        except KeyError:
            return False

    def __setitem__(self, key: str, value: Any) -> None:
        """Bind *key* at this level only; outer levels are never modified."""
        self._d[key] = value

    def __delitem__(self, key: str) -> None:
        raise Exception("DelItem is not implemented!")

    def __len__(self) -> int:
        """Return the summed sizes of this level and every outer level."""
        i = 0
        d: Optional[Bindings] = self
        while d is not None:
            i += len(d._d)
            d = d.outer
        return i

    def __iter__(self) -> Generator[str, None, None]:
        """Yield the keys of this level first, then those of each outer level."""
        d: Optional[Bindings] = self
        while d is not None:
            yield from d._d
            d = d.outer

    def __str__(self) -> str:
        # Each entry must be formatted to a string before joining; joining the
        # (key, value) tuples directly raised TypeError for non-empty bindings.
        return "Bindings({" + ", ".join(f"{k!r}: {self[k]!r}" for k in self) + "})"

    def __repr__(self) -> str:
        return str(self)
class FrozenDict(Mapping):
    """
    An immutable hashable dict

    Taken from http://stackoverflow.com/a/2704866/81121
    """

    def __init__(self, *args: Any, **kwargs: Any):
        self._d: Dict[Identifier, Identifier] = dict(*args, **kwargs)
        # Computed lazily on the first __hash__ call.
        self._hash: Optional[int] = None

    def __iter__(self):
        return iter(self._d)

    def __len__(self) -> int:
        return len(self._d)

    def __getitem__(self, key: Identifier) -> Identifier:
        return self._d[key]

    def __hash__(self) -> int:
        # XOR-ing the hashes of all keys and values is order-independent and
        # needs a single pass, unlike hashing a sorted tuple of the items.
        cached = self._hash
        if cached is None:
            cached = 0
            for key, value in self.items():
                cached ^= hash(key) ^ hash(value)
            self._hash = cached
        return cached

    def project(self, vars: Container[Variable]) -> FrozenDict:
        """Return a new FrozenDict restricted to the keys contained in *vars*."""
        return FrozenDict((k, v) for k, v in self.items() if k in vars)

    def disjointDomain(self, other: t.Mapping[Identifier, Identifier]) -> bool:
        """Return True when this mapping and *other* share no key."""
        return set(self).isdisjoint(other)

    def compatible(self, other: t.Mapping[Identifier, Identifier]) -> bool:
        """Return True unless some shared key maps to different values."""
        for key in self:
            try:
                differs = self[key] != other[key]
            except KeyError:
                # Key is missing on one side: nothing to compare.
                continue
            if differs:
                return False
        return True

    def merge(self, other: t.Mapping[Identifier, Identifier]) -> FrozenDict:
        """Return a new FrozenDict of both mappings; *other* wins on conflicts."""
        return FrozenDict(itertools.chain(self.items(), other.items()))

    def __str__(self) -> str:
        return str(self._d)

    def __repr__(self) -> str:
        return repr(self._d)
class FrozenBindings(FrozenDict):
    """A ``FrozenDict`` of bindings tied to the ``QueryContext`` it came from.

    Lookups fall back to the context's ``initBindings`` and the context's
    ``now``, ``bnodes`` and ``prologue`` are exposed as read-only properties.
    """

    def __init__(self, ctx: QueryContext, *args, **kwargs):
        FrozenDict.__init__(self, *args, **kwargs)
        self.ctx = ctx

    def __getitem__(self, key: Union[Identifier, str]) -> Identifier:
        """Return the value bound to *key*.

        Non-``Node`` keys are wrapped in ``Variable`` first. Keys that are
        neither a BNode nor a Variable are returned unchanged. Unbound keys are
        looked up in the context's ``initBindings``; ``KeyError`` is raised if
        that is unset or lacks the key.
        """
        if not isinstance(key, Node):
            key = Variable(key)

        if not isinstance(key, (BNode, Variable)):
            return key

        if key in self._d:
            return self._d[key]

        init_bindings = self.ctx.initBindings
        if init_bindings is None:
            # initBindings is Optional; keep the Mapping contract (KeyError)
            # instead of failing with TypeError on None[...].
            raise KeyError(key)
        return init_bindings[key]

    def project(self, vars: Container[Variable]) -> FrozenBindings:
        """Return new bindings restricted to the keys contained in *vars*."""
        return FrozenBindings(self.ctx, (x for x in self.items() if x[0] in vars))

    def merge(self, other: t.Mapping[Identifier, Identifier]) -> FrozenBindings:
        """Return new bindings of both mappings; *other* wins on conflicts."""
        res = FrozenBindings(self.ctx, itertools.chain(self.items(), other.items()))
        return res

    @property
    def now(self) -> datetime.datetime:
        return self.ctx.now

    @property
    def bnodes(self) -> t.Mapping[Identifier, BNode]:
        return self.ctx.bnodes

    @property
    def prologue(self) -> Optional[Prologue]:
        return self.ctx.prologue

    def forget(
        self, before: QueryContext, _except: Optional[Container[Variable]] = None
    ) -> FrozenBindings:
        """
        return a frozen dict only of bindings made in self
        since before

        Bindings listed in *_except* and bindings present in the context's
        ``initBindings`` are always kept.
        """
        if not _except:
            _except = []

        # An unset initBindings is treated as empty rather than raising
        # TypeError on the membership test.
        init_bindings = self.ctx.initBindings or ()

        # bindings from initBindings are never forgotten
        return FrozenBindings(
            self.ctx,
            (
                x
                for x in self.items()
                if (
                    x[0] in _except
                    or x[0] in init_bindings
                    or before[x[0]] is None
                )
            ),
        )

    def remember(self, these) -> FrozenBindings:
        """
        return a frozen dict only of bindings in these
        """
        return FrozenBindings(self.ctx, (x for x in self.items() if x[0] in these))
class QueryContext:
    """
    Query context - passed along when evaluating the query

    Holds the graph/dataset being queried, the current variable bindings, the
    initial bindings, the prologue, a lazily fixed "now" timestamp and the
    blank-node map.
    """

    def __init__(
        self,
        graph: Optional[Graph] = None,
        bindings: Optional[Union[Bindings, FrozenBindings, List[Any]]] = None,
        initBindings: Optional[Mapping[str, Identifier]] = None,
        datasetClause=None,
    ):
        self.initBindings = initBindings
        # The given bindings are copied into a fresh top-level Bindings, then
        # the initial bindings are applied on top.
        self.bindings = Bindings(d=bindings or [])
        if initBindings:
            self.bindings.update(initBindings)

        self.graph: Optional[Graph]
        self._dataset: Optional[Union[Dataset, ConjunctiveGraph]]
        if isinstance(graph, (Dataset, ConjunctiveGraph)):
            if datasetClause:
                # A dataset clause replaces the queried dataset: start from an
                # empty Dataset and default graph and fill them per clause.
                self._dataset = Dataset()
                self.graph = Graph()
                for d in datasetClause:
                    if d.default:
                        from_graph = graph.get_context(d.default)
                        self.graph += from_graph
                        # Nothing found in the source dataset: try loading it.
                        if not from_graph:
                            self.load(d.default, default=True)
                    elif d.named:
                        namedGraphs = Graph(
                            store=self.dataset.store, identifier=d.named
                        )
                        from_named_graphs = graph.get_context(d.named)
                        namedGraphs += from_named_graphs
                        if not from_named_graphs:
                            self.load(d.named, default=False)
            else:
                self._dataset = graph
                # The module-level flag selects between the whole dataset and
                # its default context as the active graph.
                if rdflib.plugins.sparql.SPARQL_DEFAULT_GRAPH_UNION:
                    self.graph = self.dataset
                else:
                    self.graph = self.dataset.default_context
        else:
            # Plain graph (or None): there is no dataset.
            self._dataset = None
            self.graph = graph

        self.prologue: Optional[Prologue] = None
        self._now: Optional[datetime.datetime] = None

        # Missing keys get a fresh BNode on first access.
        self.bnodes: t.MutableMapping[Identifier, BNode] = collections.defaultdict(
            BNode
        )

    @property
    def now(self) -> datetime.datetime:
        """Return a UTC timestamp, fixed on first access for this context."""
        if self._now is None:
            self._now = datetime.datetime.now(datetime.timezone.utc)
        return self._now

    def clone(
        self, bindings: Optional[Union[FrozenBindings, Bindings, List[Any]]] = None
    ) -> QueryContext:
        """Return a new context sharing this one's prologue, graph and bnodes.

        NOTE(review): ``bindings or self.bindings`` falls back to the current
        bindings for *any* falsy argument, so an empty list or empty mapping
        also yields a copy of the current bindings — confirm this is intended.
        """
        r = QueryContext(
            self._dataset if self._dataset is not None else self.graph,
            bindings or self.bindings,
            initBindings=self.initBindings,
        )
        r.prologue = self.prologue
        r.graph = self.graph
        r.bnodes = self.bnodes
        return r

    @property
    def dataset(self) -> ConjunctiveGraph:
        """Return the current dataset; raise if only a single graph is queried."""
        if self._dataset is None:
            raise Exception(
                "You performed a query operation requiring "
                + "a dataset (i.e. ConjunctiveGraph), but "
                + "operating currently on a single graph."
            )
        return self._dataset

    def load(
        self,
        source: URIRef,
        default: bool = False,
        into: Optional[Identifier] = None,
        **kwargs: Any,
    ) -> None:
        """
        Load data from the source into the query context.

        :param source: The source to load from.
        :param default: If `True`, triples from the source will be added
            to the default graph, otherwise it will be loaded into a
            graph with ``source`` URI as its name.
        :param into: The name of the graph to load the data into. If
            `None`, the source URI will be used as the name of the
            graph.
        :param kwargs: Keyword arguments to pass to
            :meth:`rdflib.graph.Graph.parse`.
        """

        def _load(graph, source):
            # Try each format in turn (turtle, xml, n3, nt); every failure but
            # the last is swallowed, the last one raises a generic Exception.
            try:
                return graph.parse(source, format="turtle", **kwargs)
            except Exception:
                pass
            try:
                return graph.parse(source, format="xml", **kwargs)
            except Exception:
                pass
            try:
                return graph.parse(source, format="n3", **kwargs)
            except Exception:
                pass
            try:
                return graph.parse(source, format="nt", **kwargs)
            except Exception:
                raise Exception(
                    "Could not load %s as either RDF/XML, N3 or NTriples" % source
                )

        if not rdflib.plugins.sparql.SPARQL_LOAD_GRAPHS:
            # we are not loading - if we already know the graph
            # being "loaded", just add it to the default-graph
            if default:
                # Unsupported left operand type for + ("None")
                self.graph += self.dataset.get_context(source)  # type: ignore[operator]
        else:
            if default:
                _load(self.graph, source)
            else:
                if into is None:
                    into = source
                _load(self.dataset.get_context(into), source)

    def __getitem__(self, key: Union[str, Path]) -> Optional[Union[str, Path]]:
        """Return the binding for a BNode/Variable key, or None when unbound.

        Any other key is returned unchanged.
        """
        # in SPARQL BNodes are just labels
        if not isinstance(key, (BNode, Variable)):
            return key
        try:
            return self.bindings[key]
        except KeyError:
            return None

    def get(self, key: str, default: Optional[Any] = None) -> Any:
        """Return ``self[key]``.

        NOTE(review): ``__getitem__`` already turns a missing binding into
        ``None``, so the ``KeyError`` branch (and therefore *default*) looks
        unreachable for unbound variables — confirm.
        """
        try:
            return self[key]
        except KeyError:
            return default

    def solution(self, vars: Optional[Iterable[Variable]] = None) -> FrozenBindings:
        """
        Return a static copy of the current variable bindings as dict

        When *vars* is truthy only the bindings whose key is in *vars* are
        copied.
        """
        if vars:
            return FrozenBindings(
                self, ((k, v) for k, v in self.bindings.items() if k in vars)
            )
        else:
            return FrozenBindings(self, self.bindings.items())

    def __setitem__(self, key: str, value: str) -> None:
        """Bind *key*; raise AlreadyBound if it is bound to a different value."""
        if key in self.bindings and self.bindings[key] != value:
            raise AlreadyBound()

        self.bindings[key] = value

    def pushGraph(self, graph: Optional[Graph]) -> QueryContext:
        """Return a clone whose active graph is *graph*."""
        r = self.clone()
        r.graph = graph
        return r

    def push(self) -> QueryContext:
        """Return a clone built from a new Bindings level on top of the current one."""
        r = self.clone(Bindings(self.bindings))
        return r

    def clean(self) -> QueryContext:
        """Return ``self.clone([])``; see the review note on ``clone``."""
        return self.clone([])

    def thaw(self, frozenbindings: FrozenBindings) -> QueryContext:
        """
        Create a new read/write query context from the given solution
        """
        c = self.clone(frozenbindings)

        return c
class Prologue:
    """
    A class for holding prefixing bindings and base URI information
    """

    def __init__(self) -> None:
        self.base: Optional[str] = None
        # The namespace manager needs a store, hence the throwaway Graph.
        self.namespace_manager = NamespaceManager(Graph())

    def resolvePName(self, prefix: Optional[str], localname: Optional[str]) -> URIRef:
        """Expand ``prefix:localname`` to a URIRef.

        A missing prefix or local name counts as the empty string. Raises
        ``Exception`` when the prefix is not bound.
        """
        namespace = self.namespace_manager.store.namespace(prefix or "")
        if namespace is not None:
            return URIRef(namespace + (localname or ""))
        raise Exception("Unknown namespace prefix : %s" % prefix)

    def bind(self, prefix: Optional[str], uri: Any) -> None:
        """Bind *prefix* to *uri*, replacing any existing binding."""
        self.namespace_manager.bind(prefix, uri, replace=True)

    def absolutize(
        self, iri: Optional[Union[CompValue, str]]
    ) -> Optional[Union[CompValue, str]]:
        """
        Apply BASE / PREFIXes to URIs
        (and to datatypes in Literals)

        Any other value is returned unchanged.

        TODO: Move resolving URIs to pre-processing
        """
        if isinstance(iri, CompValue):
            kind = iri.name
            if kind == "pname":
                return self.resolvePName(iri.prefix, iri.localname)
            if kind == "literal":
                # type error: Argument "datatype" to "Literal" has incompatible type "Union[CompValue, Identifier, None]"; expected "Optional[str]"
                return Literal(
                    iri.string, lang=iri.lang, datatype=self.absolutize(iri.datatype)  # type: ignore[arg-type]
                )
            return iri

        # A URIRef without any ":" is resolved against the base.
        if isinstance(iri, URIRef) and ":" not in iri:
            return URIRef(iri, base=self.base)

        return iri
class Query:
    """
    A parsed and translated query
    """

    def __init__(self, prologue: Prologue, algebra: CompValue):
        self.prologue, self.algebra = prologue, algebra
        # Declared for type checkers only; no value is assigned here.
        self._original_args: Tuple[str, Mapping[str, str], Optional[str]]
class Update:
    """
    A parsed and translated update
    """

    def __init__(self, prologue: Prologue, algebra: List[CompValue]):
        self.prologue, self.algebra = prologue, algebra
        # Declared for type checkers only; no value is assigned here.
        self._original_args: Tuple[str, Mapping[str, str], Optional[str]]
@@ -0,0 +1,353 @@
"""
Code for carrying out Update Operations
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Iterator, Mapping, Optional, Sequence
from rdflib.graph import Graph
from rdflib.plugins.sparql.evaluate import evalBGP, evalPart
from rdflib.plugins.sparql.evalutils import _fillTemplate, _join
from rdflib.plugins.sparql.parserutils import CompValue
from rdflib.plugins.sparql.sparql import FrozenDict, QueryContext, Update
from rdflib.term import Identifier, URIRef, Variable
def _graphOrDefault(ctx: QueryContext, g: str) -> Optional[Graph]:
if g == "DEFAULT":
return ctx.graph
else:
return ctx.dataset.get_context(g)
def _graphAll(ctx: QueryContext, g: str) -> Sequence[Graph]:
"""
return a list of graphs
"""
if g == "DEFAULT":
# type error: List item 0 has incompatible type "Optional[Graph]"; expected "Graph"
return [ctx.graph] # type: ignore[list-item]
elif g == "NAMED":
return [
# type error: Item "None" of "Optional[Graph]" has no attribute "identifier"
c
for c in ctx.dataset.contexts()
if c.identifier != ctx.graph.identifier # type: ignore[union-attr]
]
elif g == "ALL":
return list(ctx.dataset.contexts())
else:
return [ctx.dataset.get_context(g)]
def evalLoad(ctx: QueryContext, u: CompValue) -> None:
    """
    http://www.w3.org/TR/sparql11-update/#load

    Loads ``u.iri`` into the graph named ``u.graphiri`` when one is given,
    otherwise into the default graph.
    """
    if TYPE_CHECKING:
        assert isinstance(u.iri, URIRef)

    if not u.graphiri:
        ctx.load(u.iri, default=True)
    else:
        ctx.load(u.iri, default=False, into=u.graphiri)
def evalCreate(ctx: QueryContext, u: CompValue) -> None:
    """
    http://www.w3.org/TR/sparql11-update/#create

    Always raises: either because the target graph already holds triples or
    because creation is not implemented.
    """
    target = ctx.dataset.get_context(u.graphiri)
    if len(target) > 0:
        message = "Graph %s already exists." % target.identifier
    else:
        message = "Create not implemented!"
    raise Exception(message)
def evalClear(ctx: QueryContext, u: CompValue) -> None:
    """
    http://www.w3.org/TR/sparql11-update/#clear

    Removes every triple from each graph selected by ``u.graphiri``.
    """
    everything = (None, None, None)
    for graph in _graphAll(ctx, u.graphiri):
        graph.remove(everything)
def evalDrop(ctx: QueryContext, u: CompValue) -> None:
    """
    http://www.w3.org/TR/sparql11-update/#drop

    Graph-aware stores have the selected graphs removed; other stores fall
    back to clearing them.
    """
    if not ctx.dataset.store.graph_aware:
        evalClear(ctx, u)
        return

    for graph in _graphAll(ctx, u.graphiri):
        ctx.dataset.store.remove_graph(graph)
def evalInsertData(ctx: QueryContext, u: CompValue) -> None:
    """
    Evaluate an INSERT DATA operation.

    http://www.w3.org/TR/sparql11-update/#insertData

    ``u.triples`` go into the current graph; ``u.quads`` is a dict of
    graphURI=>[triples] whose entries go into the matching dataset context.
    """
    # triples for the current graph
    default_graph = ctx.graph
    default_graph += u.triples

    # quads, grouped by graph
    for graph_name in u.quads:
        # type error: Argument 1 to "get_context" of "ConjunctiveGraph" has incompatible type "Optional[Graph]"; expected "Union[IdentifiedNode, str, None]"
        named_graph = ctx.dataset.get_context(graph_name)  # type: ignore[arg-type]
        named_graph += u.quads[graph_name]
def evalDeleteData(ctx: QueryContext, u: CompValue) -> None:
    """
    Evaluate a DELETE DATA operation.

    http://www.w3.org/TR/sparql11-update/#deleteData

    ``u.triples`` are removed from the current graph; ``u.quads`` is a dict
    of graphURI=>[triples] whose entries are removed from the matching
    dataset context.
    """
    # triples from the current graph
    default_graph = ctx.graph
    default_graph -= u.triples

    # quads, grouped by graph
    for graph_name in u.quads:
        # type error: Argument 1 to "get_context" of "ConjunctiveGraph" has incompatible type "Optional[Graph]"; expected "Union[IdentifiedNode, str, None]"
        named_graph = ctx.dataset.get_context(graph_name)  # type: ignore[arg-type]
        named_graph -= u.quads[graph_name]
def evalDeleteWhere(ctx: QueryContext, u: CompValue) -> None:
    """
    Evaluate a DELETE WHERE operation.

    http://www.w3.org/TR/sparql11-update/#deleteWhere

    The triple and quad patterns are first matched to obtain solutions;
    each solution is then used to instantiate the same patterns, and the
    resulting triples are removed.
    """
    # Match the default-graph patterns, then join in each quad group
    # evaluated against its own graph.
    solutions: Iterator[FrozenDict] = evalBGP(ctx, u.triples)
    for graph_term in u.quads:
        scoped_ctx = ctx.pushGraph(ctx.dataset.get_context(graph_term))
        solutions = _join(solutions, list(evalBGP(scoped_ctx, u.quads[graph_term])))

    for solution in solutions:
        default_graph = ctx.graph
        default_graph -= _fillTemplate(u.triples, solution)

        for graph_term in u.quads:
            # the graph term may itself be bound by the solution
            named_graph = ctx.dataset.get_context(solution.get(graph_term))
            named_graph -= _fillTemplate(u.quads[graph_term], solution)
def evalModify(ctx: QueryContext, u: CompValue) -> None:
    """
    Evaluate a Modify operation (DELETE and/or INSERT templates driven by
    a WHERE clause, optionally scoped by WITH / USING / USING NAMED).

    The WHERE clause (``u.where``) is evaluated first, against a context
    adjusted by USING or WITH. The delete and insert templates are then
    instantiated once per solution and applied to the graph selected after
    the WHERE evaluation.
    """
    # Kept so the original default graph can be restored after USING.
    originalctx = ctx

    # Using replaces the dataset for evaluating the where-clause
    dg: Optional[Graph]
    if u.using:
        otherDefault = False
        for d in u.using:
            if d.default:
                if not otherDefault:
                    # replace current default graph
                    dg = Graph()
                    ctx = ctx.pushGraph(dg)
                    otherDefault = True

                ctx.load(d.default, default=True)

            elif d.named:
                g = d.named
                ctx.load(g, default=False)

    # "The WITH clause provides a convenience for when an operation
    # primarily refers to a single graph. If a graph name is specified
    # in a WITH clause, then - for the purposes of evaluating the
    # WHERE clause - this will define an RDF Dataset containing a
    # default graph with the specified name, but only in the absence
    # of USING or USING NAMED clauses. In the presence of one or more
    # graphs referred to in USING clauses and/or USING NAMED clauses,
    # the WITH clause will be ignored while evaluating the WHERE
    # clause."
    if not u.using and u.withClause:
        g = ctx.dataset.get_context(u.withClause)
        ctx = ctx.pushGraph(g)

    res = evalPart(ctx, u.where)

    # After the WHERE clause: undo the temporary USING default graph and,
    # if WITH was given, make its graph the target of the templates.
    if u.using:
        if otherDefault:
            ctx = originalctx  # restore original default graph
        if u.withClause:
            g = ctx.dataset.get_context(u.withClause)
            ctx = ctx.pushGraph(g)

    # Apply the templates once per solution of the WHERE clause.
    for c in res:
        dg = ctx.graph
        if u.delete:
            # type error: Unsupported left operand type for - ("None")
            # type error: Unsupported operand types for - ("Graph" and "Generator[Tuple[Identifier, Identifier, Identifier], None, None]")
            dg -= _fillTemplate(u.delete.triples, c)  # type: ignore[operator]

            # quad templates: the graph term is resolved through the solution
            for g, q in u.delete.quads.items():
                cg = ctx.dataset.get_context(c.get(g))
                cg -= _fillTemplate(q, c)

        if u.insert:
            # type error: Unsupported left operand type for + ("None")
            # type error: Unsupported operand types for + ("Graph" and "Generator[Tuple[Identifier, Identifier, Identifier], None, None]")
            dg += _fillTemplate(u.insert.triples, c)  # type: ignore[operator]

            for g, q in u.insert.quads.items():
                cg = ctx.dataset.get_context(c.get(g))
                cg += _fillTemplate(q, c)
def evalAdd(ctx: QueryContext, u: CompValue) -> None:
    """
    Evaluate an ADD operation: add all triples from src to dst.

    http://www.w3.org/TR/sparql11-update/#add

    Nothing happens when both references resolve to the same graph.
    """
    source_ref, target_ref = u.graph
    source = _graphOrDefault(ctx, source_ref)
    target = _graphOrDefault(ctx, target_ref)

    # type error: Item "None" of "Optional[Graph]" has no attribute "identifier"
    if source.identifier != target.identifier:  # type: ignore[union-attr]
        # type error: Unsupported left operand type for + ("None")
        target += source  # type: ignore[operator]
def evalMove(ctx: QueryContext, u: CompValue) -> None:
    """
    Evaluate a MOVE operation.

    http://www.w3.org/TR/sparql11-update/#move

    Steps, skipped entirely when src and dst are the same graph:

    1. remove all triples from dst
    2. add all triples from src to dst
    3. remove src (drop the graph on a graph-aware store, otherwise
       remove all of its triples)
    """
    source_ref, target_ref = u.graph
    source = _graphOrDefault(ctx, source_ref)
    target = _graphOrDefault(ctx, target_ref)

    # type error: Item "None" of "Optional[Graph]" has no attribute "identifier"
    if source.identifier == target.identifier:  # type: ignore[union-attr]
        return

    everything = (None, None, None)
    # type error: Item "None" of "Optional[Graph]" has no attribute "remove"
    target.remove(everything)  # type: ignore[union-attr]
    # type error: Unsupported left operand type for + ("None")
    target += source  # type: ignore[operator]

    if not ctx.dataset.store.graph_aware:
        # type error: Item "None" of "Optional[Graph]" has no attribute "remove"
        source.remove(everything)  # type: ignore[union-attr]
        return
    # type error: Argument 1 to "remove_graph" of "Store" has incompatible type "Optional[Graph]"; expected "Graph"
    ctx.dataset.store.remove_graph(source)  # type: ignore[arg-type]
def evalCopy(ctx: QueryContext, u: CompValue) -> None:
    """
    Evaluate a COPY operation.

    http://www.w3.org/TR/sparql11-update/#copy

    Steps, skipped entirely when src and dst are the same graph:

    1. remove all triples from dst
    2. add all triples from src to dst
    """
    source_ref, target_ref = u.graph
    source = _graphOrDefault(ctx, source_ref)
    target = _graphOrDefault(ctx, target_ref)

    # type error: Item "None" of "Optional[Graph]" has no attribute "identifier"
    if source.identifier != target.identifier:  # type: ignore[union-attr]
        # type error: Item "None" of "Optional[Graph]" has no attribute "remove"
        target.remove((None, None, None))  # type: ignore[union-attr]
        # type error: Unsupported left operand type for + ("None")
        target += source  # type: ignore[operator]
def evalUpdate(
    graph: Graph,
    update: Update,
    initBindings: Optional[Mapping[str, Identifier]] = None,
) -> None:
    """
    Execute every operation of a parsed SPARQL Update request.

    http://www.w3.org/TR/sparql11-update/#updateLanguage

    'A request is a sequence of operations [...] Implementations MUST
    ensure that operations of a single request are executed in a
    fashion that guarantees the same effects as executing them in
    lexical order.

    Operations all result either in success or failure.

    If multiple operations are present in a single request, then a
    result of failure from any operation MUST abort the sequence of
    operations, causing the subsequent operations to be ignored.'

    This will return None on success and raise Exceptions on error.
    Errors raised by an operation whose ``silent`` flag is set are
    suppressed; ``KeyboardInterrupt`` and ``SystemExit`` are never
    suppressed.

    :param graph: graph (or dataset) the operations are applied to.
    :param update: parsed update request; ``update.algebra`` is iterated.
    :param initBindings: optional initial variable bindings, keyed by
        variable name.

    .. caution::

        This method can access indirectly requested network endpoints, for
        example, query processing will attempt to access network endpoints
        specified in ``SERVICE`` directives.

        When processing untrusted or potentially malicious queries, measures
        should be taken to restrict network and file access.

        For information on available security measures, see the RDFLib
        :doc:`Security Considerations </security_considerations>`
        documentation.
    """
    # Built per call so the evaluators are resolved at call time.
    handlers = {
        "Load": evalLoad,
        "Clear": evalClear,
        "Drop": evalDrop,
        "Create": evalCreate,
        "Add": evalAdd,
        "Move": evalMove,
        "Copy": evalCopy,
        "InsertData": evalInsertData,
        "DeleteData": evalDeleteData,
        "DeleteWhere": evalDeleteWhere,
        "Modify": evalModify,
    }

    # Normalise the binding keys once instead of rebinding the parameter
    # on every iteration.
    bindings = {Variable(k): v for k, v in (initBindings or {}).items()}

    for u in update.algebra:
        # Each operation gets its own context and its own copy of the
        # bindings.
        ctx = QueryContext(graph, initBindings=dict(bindings))
        ctx.prologue = u.prologue

        try:
            handler = handlers.get(u.name)
            if handler is None:
                raise Exception("Unknown update operation: %s" % (u,))
            handler(ctx, u)
        except Exception:
            # SILENT swallows operation failures only; a bare ``except``
            # here would also swallow KeyboardInterrupt / SystemExit.
            if not u.silent:
                raise
@@ -0,0 +1,3 @@
"""
This package contains modules for additional RDFLib stores
"""
@@ -0,0 +1,199 @@
"""
This wrapper intercepts calls through the store interface and implements
thread-safe logging of destructive operations (adds / removes) in reverse.
This is persisted on the store instance and the reverse operations are
executed in order to return the store to the state it was in when the
transaction began. Since the reverse operations are persisted on the store,
the store itself acts as a transaction.
Calls to commit or rollback flush the list of reverse operations. This
provides thread-safe atomicity and isolation (assuming concurrent operations
occur with different store instances), but no durability (transactions are
persisted in memory and won't be available to reverse operations after the
system fails): A and I out of ACID.
"""
from __future__ import annotations
import threading
from typing import TYPE_CHECKING, Any, Generator, Iterator, List, Optional, Tuple
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.store import Store
if TYPE_CHECKING:
from rdflib.graph import (
_ContextIdentifierType,
_ContextType,
_ObjectType,
_PredicateType,
_SubjectType,
_TriplePatternType,
_TripleType,
)
from rdflib.query import Result
from rdflib.term import URIRef
# Optional module-level locks, keyed by operation name, consulted by
# AuditableStore.add and AuditableStore.remove. While an entry is None the
# corresponding method falls back to a freshly created threading.RLock.
destructiveOpLocks = {  # noqa: N816
    "add": None,
    "remove": None,
}
class AuditableStore(Store):
    """
    Store wrapper that records how to undo every ``add`` / ``remove``.

    All calls are forwarded to the wrapped ``store``. For each quad that
    is actually added or removed, the inverse operation is kept in
    ``reverseOps`` so that ``rollback`` can restore the state the store
    had when the transaction began; ``commit`` simply forgets the
    recorded operations.
    """

    def __init__(self, store: Store):
        """
        :param store: the store whose destructive operations are audited.
        """
        self.store = store
        self.context_aware = store.context_aware
        # NOTE: this store can't be formula_aware as it doesn't have enough
        # info to reverse the removal of a quoted statement
        self.formula_aware = False  # store.formula_aware
        self.transaction_aware = True  # This is only half true
        # Pending undo log: (subject, predicate, object, context id, op)
        # where op is the operation ("add" / "remove") that undoes a change.
        self.reverseOps: List[
            Tuple[
                Optional[_SubjectType],
                Optional[_PredicateType],
                Optional[_ObjectType],
                Optional[_ContextIdentifierType],
                str,
            ]
        ] = []
        self.rollbackLock = threading.RLock()

    def __rebind_context(
        self, context: Optional[_ContextType]
    ) -> Optional[_ContextType]:
        """Return a copy of ``context`` bound to the wrapped store."""
        if context is None:
            return None
        return context.__class__(self.store, context.identifier)

    def __note_change(
        self,
        s: Any,
        p: Any,
        o: Any,
        ctx_id: Any,
        undo_op: str,
        cancelled_op: str,
    ) -> None:
        """
        Update the undo log for one changed quad.

        If the log holds ``cancelled_op`` for this quad, the new change
        merely reverts an earlier change of this transaction, so that
        entry is dropped; otherwise ``undo_op`` is recorded.
        """
        try:
            self.reverseOps.remove((s, p, o, ctx_id, cancelled_op))
        except ValueError:
            self.reverseOps.append((s, p, o, ctx_id, undo_op))

    def open(self, configuration: str, create: bool = True) -> Optional[int]:
        """Open the wrapped store."""
        return self.store.open(configuration, create)

    def close(self, commit_pending_transaction: bool = False) -> None:
        """Close the wrapped store, forwarding the commit request."""
        self.store.close(commit_pending_transaction=commit_pending_transaction)

    def destroy(self, configuration: str) -> None:
        """Destroy the wrapped store."""
        self.store.destroy(configuration)

    def query(self, *args: Any, **kw: Any) -> Result:
        """Run a query directly on the wrapped store."""
        return self.store.query(*args, **kw)

    def add(
        self, triple: _TripleType, context: _ContextType, quoted: bool = False
    ) -> None:
        """
        Add ``triple`` to ``context`` and log how to undo it.

        Nothing is logged or written when the triple is already present.
        """
        (s, p, o) = triple
        # NOTE(review): when no lock is configured in destructiveOpLocks a
        # new RLock is created per call, which excludes nobody.
        lock = destructiveOpLocks["add"]
        lock = lock if lock else threading.RLock()
        with lock:
            context = self.__rebind_context(context)
            ctxId = context.identifier if context is not None else None  # noqa: N806
            if list(self.store.triples(triple, context)):
                return  # triple already in store, do nothing
            # Re-adding a quad removed earlier in this transaction cancels
            # the pending "add"; recording a "remove" as well would make
            # rollback delete a quad that existed before the transaction.
            self.__note_change(s, p, o, ctxId, "remove", "add")
            self.store.add((s, p, o), context, quoted)

    def remove(
        self, spo: _TriplePatternType, context: Optional[_ContextType] = None
    ) -> None:
        """
        Remove everything matching ``spo`` and log how to undo it.

        Wildcard patterns are expanded first so that each concrete quad
        that is about to disappear gets its own undo entry.
        """
        subject, predicate, object_ = spo
        lock = destructiveOpLocks["remove"]
        lock = lock if lock else threading.RLock()
        with lock:
            # Need to determine which quads will be removed if any term is a
            # wildcard
            context = self.__rebind_context(context)
            ctxId = context.identifier if context is not None else None  # noqa: N806
            if None in [subject, predicate, object_, context]:
                if ctxId:
                    # type error: Item "None" of "Optional[Graph]" has no attribute "triples"
                    for s, p, o in context.triples((subject, predicate, object_)):  # type: ignore[union-attr]
                        self.__note_change(s, p, o, ctxId, "add", "remove")
                else:
                    for s, p, o, ctx in ConjunctiveGraph(self.store).quads(
                        (subject, predicate, object_)
                    ):
                        # type error: Item "None" of "Optional[Graph]" has no attribute "identifier"
                        self.__note_change(s, p, o, ctx.identifier, "add", "remove")  # type: ignore[union-attr]
            else:
                if not list(self.triples((subject, predicate, object_), context)):
                    return  # triple not present in store, do nothing
                self.__note_change(
                    subject, predicate, object_, ctxId, "add", "remove"
                )
            self.store.remove((subject, predicate, object_), context)

    def triples(
        self, triple: _TriplePatternType, context: Optional[_ContextType] = None
    ) -> Iterator[Tuple[_TripleType, Iterator[Optional[_ContextType]]]]:
        """Yield the matching triples of the wrapped store."""
        (su, pr, ob) = triple
        context = self.__rebind_context(context)
        for (s, p, o), cg in self.store.triples((su, pr, ob), context):
            yield (s, p, o), cg

    def __len__(self, context: Optional[_ContextType] = None):
        """Return the wrapped store's triple count for ``context``."""
        context = self.__rebind_context(context)
        return self.store.__len__(context)

    def contexts(
        self, triple: Optional[_TripleType] = None
    ) -> Generator[_ContextType, None, None]:
        """Yield the contexts of the wrapped store."""
        for ctx in self.store.contexts(triple):
            yield ctx

    def bind(self, prefix: str, namespace: URIRef, override: bool = True) -> None:
        """Bind ``prefix`` to ``namespace`` on the wrapped store."""
        self.store.bind(prefix, namespace, override=override)

    def prefix(self, namespace: URIRef) -> Optional[str]:
        """Look up the prefix bound to ``namespace``."""
        return self.store.prefix(namespace)

    def namespace(self, prefix: str) -> Optional[URIRef]:
        """Look up the namespace bound to ``prefix``."""
        return self.store.namespace(prefix)

    def namespaces(self) -> Iterator[Tuple[str, URIRef]]:
        """Return the namespace bindings of the wrapped store."""
        return self.store.namespaces()

    def commit(self) -> None:
        """Forget the recorded reverse operations."""
        self.reverseOps = []

    def rollback(self) -> None:
        """Apply the recorded reverse operations, then clear the log."""
        # Acquire Rollback lock and apply reverse operations in the forward
        # order
        with self.rollbackLock:
            for subject, predicate, obj, context, op in self.reverseOps:
                if op == "add":
                    # type error: Argument 2 to "Graph" has incompatible type "Optional[Node]"; expected "Union[IdentifiedNode, str, None]"
                    self.store.add(
                        (subject, predicate, obj), Graph(self.store, context)  # type: ignore[arg-type]
                    )
                else:
                    self.store.remove(
                        (subject, predicate, obj), Graph(self.store, context)
                    )

            self.reverseOps = []
@@ -0,0 +1,775 @@
from __future__ import annotations
import logging
from os import mkdir
from os.path import abspath, exists
from threading import Thread
from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Optional, Tuple
from urllib.request import pathname2url
from rdflib.store import NO_STORE, VALID_STORE, Store
from rdflib.term import Identifier, Node, URIRef
if TYPE_CHECKING:
from rdflib.graph import Graph, _ContextType, _TriplePatternType, _TripleType
def bb(u: str) -> bytes:
    """Return ``u`` encoded as UTF-8 bytes."""
    encoded = u.encode("utf-8")
    return encoded
# berkeleydb is optional: remember whether it imported so that the store
# can refuse to be constructed / opened instead of failing at import time.
try:
    from berkeleydb import db

    has_bsddb = True
except ImportError:
    has_bsddb = False


if has_bsddb:
    # These are passed to bsddb when creating DBs

    # passed to db.DBEnv.set_flags
    ENVSETFLAGS = db.DB_CDB_ALLDB
    # passed to db.DBEnv.open
    ENVFLAGS = db.DB_INIT_MPOOL | db.DB_INIT_CDB | db.DB_THREAD
    # passed to db.DBEnv.set_cachesize (presumably a size in bytes, i.e.
    # 50 MiB -- TODO confirm against the berkeleydb documentation)
    CACHESIZE = 1024 * 1024 * 50

    # passed to db.DB.Open()
    DBOPENFLAGS = db.DB_THREAD
logger = logging.getLogger(__name__)

__all__ = [
    "BerkeleyDB",
    "_ToKeyFunc",
    "_FromKeyFunc",
    "_GetPrefixFunc",
    "_ResultsFromKeyFunc",
]


# Builds an index key from an (s, p, o) triple of encoded terms and an
# encoded context.
_ToKeyFunc = Callable[[Tuple[bytes, bytes, bytes], bytes], bytes]
# Splits an index key back into its four parts (unpacked by callers as
# context, subject, predicate, object).
_FromKeyFunc = Callable[[bytes], Tuple[bytes, bytes, bytes, bytes]]
# Yields the key segments for a triple pattern and an optional context.
_GetPrefixFunc = Callable[
    [Tuple[str, str, str], Optional[str]], Generator[str, None, None]
]
# Turns an index key plus the queried subject / predicate / object and the
# stored contexts value into a (triple, contexts generator) result.
_ResultsFromKeyFunc = Callable[
    [bytes, Optional[Node], Optional[Node], Optional[Node], bytes],
    Tuple[Tuple[Node, Node, Node], Generator[Node, None, None]],
]
class BerkeleyDB(Store):
"""\
A store that allows for on-disk persistence using BerkeleyDB, a fast
key/value DB.
This store implementation used to be known, previous to rdflib 6.0.0
as 'Sleepycat' due to that being the then name of the Python wrapper
for BerkeleyDB.
This store allows for quads as well as triples. See examples of use
in both the `examples.berkeleydb_example` and ``test/test_store/test_store_berkeleydb.py``
files.
**NOTE on installation**:
To use this store, you must have BerkeleyDB installed on your system
separately to Python (``brew install berkeley-db`` on a Mac) and also have
the BerkeleyDB Python wrapper installed (``pip install berkeleydb``).
You may need to install BerkeleyDB Python wrapper like this:
``YES_I_HAVE_THE_RIGHT_TO_USE_THIS_BERKELEY_DB_VERSION=1 pip install berkeleydb``
"""
context_aware = True
formula_aware = True
transaction_aware = False
graph_aware = True
db_env: db.DBEnv = None
def __init__(
    self,
    configuration: Optional[str] = None,
    identifier: Optional[Identifier] = None,
):
    """
    Create the store object.

    :param configuration: forwarded unchanged to ``Store.__init__``.
    :param identifier: optional identifier for this store; when left as
        ``None``, ``open`` derives one from the database path.
    :raises ImportError: if the ``berkeleydb`` module could not be
        imported.
    """
    if not has_bsddb:
        raise ImportError("Unable to import berkeleydb, store is unusable.")
    # These attributes are set before the base-class initialiser runs.
    # NOTE(review): presumably because Store.__init__ may call open() when
    # a configuration is supplied -- confirm against rdflib.store.Store.
    self.__open = False
    self.__identifier = identifier
    super(BerkeleyDB, self).__init__(configuration)
    # Shortcuts to the node pickler, used to (de)serialise terms.
    self._loads = self.node_pickler.loads
    self._dumps = self.node_pickler.dumps
    # Declared here, populated by open().
    self.__indicies_info: List[Tuple[Any, _ToKeyFunc, _FromKeyFunc]]
def __get_identifier(self) -> Optional[Identifier]:
    """Return the identifier given at construction or derived by ``open``."""
    return self.__identifier

# Read-only view of the store identifier.
identifier = property(fget=__get_identifier)
def _init_db_environment(
    self, homeDir: str, create: bool = True  # noqa: N803
) -> db.DBEnv:
    """
    Open the BerkeleyDB environment rooted at ``homeDir``.

    A missing directory is created when ``create`` is ``True``; otherwise
    ``NO_STORE`` is returned instead of an environment.
    """
    if not exists(homeDir):
        if create is not True:
            return NO_STORE
        mkdir(homeDir)
        # TODO: implement create method and refactor this to it
        self.create(homeDir)

    environment = db.DBEnv()
    environment.set_cachesize(0, CACHESIZE)  # TODO
    # db_env.set_lg_max(1024*1024)
    environment.set_flags(ENVSETFLAGS, 1)
    environment.open(homeDir, ENVFLAGS | db.DB_CREATE)
    return environment
def is_open(self) -> bool:
    """Tell whether ``open`` has succeeded and ``close`` has not run since."""
    currently_open = self.__open
    return currently_open
def open(self, path: str, create: bool = True) -> Optional[int]:
    """
    Open (and optionally create) the BerkeleyDB environment at ``path``.

    Opens the three triple indices plus the ``contexts``, ``namespace``,
    ``prefix``, ``k2i`` and ``i2k`` databases, builds the table used to
    pick an index for every bound/unbound triple-pattern combination, and
    starts the background sync thread.

    :param path: home directory of the BerkeleyDB environment.
    :param create: when true, missing directory / databases are created.
    :return: ``VALID_STORE`` on success; ``NO_STORE`` when berkeleydb is
        unavailable or the environment could not be initialised.
    """
    if not has_bsddb:
        return NO_STORE
    homeDir = path  # noqa: N806

    if self.__identifier is None:
        self.__identifier = URIRef(pathname2url(abspath(homeDir)))

    db_env = self._init_db_environment(homeDir, create)
    if db_env == NO_STORE:
        return NO_STORE
    self.db_env = db_env
    self.__open = True

    dbname = None
    dbtype = db.DB_BTREE
    # auto-commit ensures that the open-call commits when transactions
    # are enabled
    dbopenflags = DBOPENFLAGS
    if self.transaction_aware is True:
        dbopenflags |= db.DB_AUTO_COMMIT

    if create:
        dbopenflags |= db.DB_CREATE

    dbmode = 0o660
    dbsetflags = 0

    # create and open the DBs
    self.__indicies: List[db.DB] = [
        None,
    ] * 3
    # NOTE on type ignore: this is because type checker does not like this
    # way of initializing, using a temporary variable will solve it.
    # type error: error: List item 0 has incompatible type "None"; expected "Tuple[Any, Callable[[Tuple[bytes, bytes, bytes], bytes], bytes], Callable[[bytes], Tuple[bytes, bytes, bytes, bytes]]]"
    self.__indicies_info = [
        None,  # type: ignore[list-item]
    ] * 3
    for i in range(0, 3):
        # The database name is the key layout itself, obtained by running
        # the key function on the placeholder terms s / p / o / c.
        index_name = to_key_func(i)(
            ("s".encode("latin-1"), "p".encode("latin-1"), "o".encode("latin-1")),
            "c".encode("latin-1"),
        ).decode()
        index = db.DB(db_env)
        index.set_flags(dbsetflags)
        index.open(index_name, dbname, dbtype, dbopenflags, dbmode)
        self.__indicies[i] = index
        self.__indicies_info[i] = (index, to_key_func(i), from_key_func(i))

    def get_prefix_func(start: int, end: int) -> _GetPrefixFunc:
        """Build a generator of key segments for positions start..end-1."""

        def get_prefix(
            triple: Tuple[str, str, str], context: Optional[str]
        ) -> Generator[str, None, None]:
            if context is None:
                yield ""
            else:
                yield context
            pos = start
            while pos < end:
                yield triple[pos % 3]
                pos += 1
            yield ""

        return get_prefix

    lookup: Dict[
        int, Tuple[db.DB, _GetPrefixFunc, _FromKeyFunc, _ResultsFromKeyFunc]
    ] = {}
    # ``i`` is a bit mask of the bound terms (1 = subject, 2 = predicate,
    # 4 = object). For every mask pick the index whose key order starts
    # with the longest run of bound terms.
    for i in range(0, 8):
        results: List[Tuple[Tuple[int, int], int, int]] = []
        for start in range(0, 3):
            score = 1
            match_len = 0
            for j in range(start, start + 3):
                if i & (1 << (j % 3)):
                    score = score << 1
                    match_len += 1
                else:
                    break
            tie_break = 2 - start
            results.append(((score, tie_break), start, match_len))

        results.sort()
        # best candidate sorts last; only its start and length are needed
        _, start, match_len = results[-1]

        lookup[i] = (
            self.__indicies[start],
            get_prefix_func(start, start + match_len),
            from_key_func(start),
            results_from_key_func(start, self._from_string),
        )

    self.__lookup_dict = lookup

    self.__contexts = db.DB(db_env)
    self.__contexts.set_flags(dbsetflags)
    self.__contexts.open("contexts", dbname, dbtype, dbopenflags, dbmode)

    self.__namespace = db.DB(db_env)
    self.__namespace.set_flags(dbsetflags)
    self.__namespace.open("namespace", dbname, dbtype, dbopenflags, dbmode)

    self.__prefix = db.DB(db_env)
    self.__prefix.set_flags(dbsetflags)
    self.__prefix.open("prefix", dbname, dbtype, dbopenflags, dbmode)

    self.__k2i = db.DB(db_env)
    self.__k2i.set_flags(dbsetflags)
    self.__k2i.open("k2i", dbname, db.DB_HASH, dbopenflags, dbmode)

    self.__i2k = db.DB(db_env)
    self.__i2k.set_flags(dbsetflags)
    self.__i2k.open("i2k", dbname, db.DB_RECNO, dbopenflags, dbmode)

    self.__needs_sync = False
    # ``daemon=True`` replaces the deprecated ``Thread.setDaemon(True)``.
    t = Thread(target=self.__sync_run, daemon=True)
    t.start()
    self.__sync_thread = t
    return VALID_STORE
def __sync_run(self) -> None:
    """
    Body of the background thread started by ``open``.

    While the store is open, wait for ``__needs_sync`` to be raised, then
    call ``sync`` once no further sync request has arrived for more than
    ``min_seconds`` or more than ``max_seconds`` have passed since the
    first request. Any exception is logged and ends the thread.
    """
    from time import sleep, time

    try:
        min_seconds, max_seconds = 10, 300
        while self.__open:
            if self.__needs_sync:
                # t0: time of the first request; t1: time of the latest one
                t0 = t1 = time()
                self.__needs_sync = False
                while self.__open:
                    sleep(0.1)
                    if self.__needs_sync:
                        # another request arrived: restart the quiet period
                        t1 = time()
                        self.__needs_sync = False
                    if time() - t1 > min_seconds or time() - t0 > max_seconds:
                        self.__needs_sync = False
                        logger.debug("sync")
                        self.sync()
                        break
            else:
                # nothing pending: poll once per second
                sleep(1)
    except Exception as e:
        logger.exception(e)
def sync(self) -> None:
    """Call ``sync`` on every open database; do nothing when closed."""
    if not self.__open:
        return
    for index in self.__indicies:
        index.sync()
    for database in (
        self.__contexts,
        self.__namespace,
        self.__prefix,
        self.__i2k,
        self.__k2i,
    ):
        database.sync()
def close(self, commit_pending_transaction: bool = False) -> None:
    """
    Stop the sync thread and close all databases and the environment.

    ``commit_pending_transaction`` is accepted but not used here.
    """
    # Clearing the flag lets the sync thread's loops end before the join.
    self.__open = False
    self.__sync_thread.join()
    for index in self.__indicies:
        index.close()
    for database in (
        self.__contexts,
        self.__namespace,
        self.__prefix,
        self.__i2k,
        self.__k2i,
    ):
        database.close()
    self.db_env.close()
def add(
    self,
    triple: _TripleType,
    context: _ContextType,
    quoted: bool = False,
    txn: Optional[Any] = None,
) -> None:
    """\
    Add a triple to the store of triples.

    The triple is written to all three indices under its context and,
    unless ``quoted``, also under the empty context together with the
    updated set of contexts it occurs in. Nothing is written when the
    quad is already present.
    """
    (subject, predicate, obj) = triple
    assert self.__open, "The Store must be open."
    assert context != self, "Can not add triple directly to store"
    Store.add(self, (subject, predicate, obj), context, quoted)

    encode = self._to_string
    s = encode(subject, txn=txn)
    p = encode(predicate, txn=txn)
    o = encode(obj, txn=txn)
    c = encode(context, txn=txn)

    cspo, cpos, cosp = self.__indicies

    if cspo.get(bb("%s^%s^%s^%s^" % (c, s, p, o)), txn=txn) is not None:
        return  # quad already stored

    self.__contexts.put(bb(c), b"", txn=txn)

    separator = "^".encode("latin-1")
    known = cspo.get(bb("%s^%s^%s^%s^" % ("", s, p, o)), txn=txn) or "".encode(
        "latin-1"
    )
    contexts = set(known.split(separator))
    contexts.add(bb(c))
    contexts_value = separator.join(contexts)
    assert contexts_value is not None

    cspo.put(bb("%s^%s^%s^%s^" % (c, s, p, o)), b"", txn=txn)
    cpos.put(bb("%s^%s^%s^%s^" % (c, p, o, s)), b"", txn=txn)
    cosp.put(bb("%s^%s^%s^%s^" % (c, o, s, p)), b"", txn=txn)
    if not quoted:
        # entries under the empty context carry the list of contexts
        cspo.put(bb("%s^%s^%s^%s^" % ("", s, p, o)), contexts_value, txn=txn)
        cpos.put(bb("%s^%s^%s^%s^" % ("", p, o, s)), contexts_value, txn=txn)
        cosp.put(bb("%s^%s^%s^%s^" % ("", o, s, p)), contexts_value, txn=txn)

    self.__needs_sync = True
def __remove(
    self,
    spo: Tuple[bytes, bytes, bytes],
    c: bytes,
    quoted: bool = False,
    txn: Optional[Any] = None,
) -> None:
    """
    Delete one encoded quad from all three indices.

    Unless ``quoted``, the entry stored under the empty context is
    rewritten with the remaining contexts, or deleted when none remain.
    """
    s, p, o = spo
    separator = "^".encode("latin-1")
    empty = "".encode("latin-1")
    cspo, _cpos, _cosp = self.__indicies

    stored = cspo.get(separator.join([empty, s, p, o, empty]), txn=txn) or empty
    remaining = set(stored.split(separator))
    remaining.discard(c)
    remaining_value = separator.join(remaining)

    for index, to_key, _unused in self.__indicies_info:
        index.delete(to_key((s, p, o), c), txn=txn)

    if quoted:
        return

    for index, to_key, _unused in self.__indicies_info:
        if remaining_value:
            index.put(to_key((s, p, o), empty), remaining_value, txn=txn)
        else:
            try:
                index.delete(to_key((s, p, o), empty), txn=txn)
            except db.DBNotFoundError:
                pass  # TODO: is it okay to ignore these?
# type error: Signature of "remove" incompatible with supertype "Store"
def remove(  # type: ignore[override]
    self,
    spo: _TriplePatternType,
    context: Optional[_ContextType],
    txn: Optional[Any] = None,
) -> None:
    """
    Remove every triple matching ``spo`` from ``context``.

    A fully bound pattern with a context is removed directly. Otherwise
    the best index for the pattern is scanned; with ``context`` of
    ``None`` (or the store itself) each match is deleted from every
    context it is recorded in and from the entry under the empty context.
    Removing ``(None, None, None)`` from a context also drops the context
    record.

    :param spo: triple pattern; ``None`` is a wildcard.
    :param context: context to remove from, or ``None`` for all contexts.
    :param txn: optional transaction handle passed to the databases.
    """
    subject, predicate, object = spo
    assert self.__open, "The Store must be open."
    Store.remove(self, (subject, predicate, object), context)
    _to_string = self._to_string

    if context is not None:
        if context == self:
            context = None

    if (
        subject is not None
        and predicate is not None
        and object is not None
        and context is not None
    ):
        s = _to_string(subject, txn=txn)
        p = _to_string(predicate, txn=txn)
        o = _to_string(object, txn=txn)
        c = _to_string(context, txn=txn)
        value = self.__indicies[0].get(bb("%s^%s^%s^%s^" % (c, s, p, o)), txn=txn)
        if value is not None:
            self.__remove((bb(s), bb(p), bb(o)), bb(c), txn=txn)
            self.__needs_sync = True
    else:
        index, prefix, from_key, _results_from_key = self.__lookup(
            (subject, predicate, object), context, txn=txn
        )

        cursor = index.cursor(txn=txn)
        try:
            current = cursor.set_range(prefix)
            needs_sync = True
        except db.DBNotFoundError:
            current = None
            needs_sync = False
        cursor.close()
        while current:
            key, value = current
            # Fetch the successor before deleting: a fresh cursor is
            # positioned on ``key`` and advanced once.
            cursor = index.cursor(txn=txn)
            try:
                cursor.set_range(key)
                # ``next`` must be called; assigning the bound method
                # would break the unpacking on the next iteration.
                current = cursor.next()
            except db.DBNotFoundError:
                current = None
            cursor.close()
            if key.startswith(prefix):
                # NOTE on type error: variables are being reused with a
                # different type
                # type error: Incompatible types in assignment (expression has type "bytes", variable has type "str")
                c, s, p, o = from_key(key)  # type: ignore[assignment]
                if context is None:
                    contexts_value = index.get(key, txn=txn) or "".encode("latin-1")
                    # remove triple from all non quoted contexts
                    contexts = set(contexts_value.split("^".encode("latin-1")))
                    # and from the conjunctive index
                    contexts.add("".encode("latin-1"))
                    for c in contexts:
                        for i, _to_key, _ in self.__indicies_info:
                            # NOTE on type error: variables are being
                            # reused with a different type
                            # type error: Argument 1 has incompatible type "Tuple[str, str, str]"; expected "Tuple[bytes, bytes, bytes]"
                            # type error: Argument 2 has incompatible type "str"; expected "bytes"
                            i.delete(_to_key((s, p, o), c), txn=txn)  # type: ignore[arg-type]
                else:
                    # type error: Argument 1 to "__remove" of "BerkeleyDB" has incompatible type "Tuple[str, str, str]"; expected "Tuple[bytes, bytes, bytes]"
                    # type error: Argument 2 to "__remove" of "BerkeleyDB" has incompatible type "str"; expected "bytes"
                    self.__remove((s, p, o), c, txn=txn)  # type: ignore[arg-type]
            else:
                break

        if context is not None:
            if subject is None and predicate is None and object is None:
                # TODO: also if context becomes empty and not just on
                # remove((None, None, None), c)
                try:
                    self.__contexts.delete(
                        bb(_to_string(context, txn=txn)), txn=txn
                    )
                except db.DBNotFoundError:
                    pass

        self.__needs_sync = needs_sync
def triples(
    self,
    spo: _TriplePatternType,
    context: Optional[_ContextType] = None,
    txn: Optional[Any] = None,
) -> Generator[
    Tuple[_TripleType, Generator[Optional[_ContextType], None, None]],
    None,
    None,
]:
    """
    A generator over all the triples matching ``spo``.

    :param spo: triple pattern; ``None`` is a wildcard.
    :param context: context to search; ``None`` or the store itself means
        all contexts.
    :param txn: optional transaction handle passed to the database.
    """
    assert self.__open, "The Store must be open."

    subject, predicate, object = spo

    if context is not None:
        if context == self:
            context = None

    # _from_string = self._from_string ## UNUSED
    index, prefix, from_key, results_from_key = self.__lookup(
        (subject, predicate, object), context, txn=txn
    )

    cursor = index.cursor(txn=txn)
    try:
        current = cursor.set_range(prefix)
    except db.DBNotFoundError:
        current = None
    cursor.close()
    while current:
        key, value = current
        # No cursor is held across the ``yield``: a fresh one is
        # positioned on ``key`` and advanced once to find the successor.
        cursor = index.cursor(txn=txn)
        try:
            cursor.set_range(key)
            # ``next`` must be called; assigning the bound method would
            # break the unpacking on the next iteration.
            current = cursor.next()
        except db.DBNotFoundError:
            current = None
        cursor.close()
        if key and key.startswith(prefix):
            contexts_value = index.get(key, txn=txn)
            # type error: Incompatible types in "yield" (actual type "Tuple[Tuple[Node, Node, Node], Generator[Node, None, None]]", expected type "Tuple[Tuple[IdentifiedNode, URIRef, Identifier], Iterator[Optional[Graph]]]")
            # NOTE on type ignore: this is needed because some context is
            # lost in the process of extracting triples from the database.
            yield results_from_key(key, subject, predicate, object, contexts_value)  # type: ignore[misc]
        else:
            break
def __len__(self, context: Optional[_ContextType] = None) -> int:
    """
    Count the triples stored under ``context``.

    With ``context`` of ``None`` (or the store itself) the entries under
    the empty context are counted.
    """
    assert self.__open, "The Store must be open."
    if context is not None:
        if context == self:
            context = None

    if context is None:
        prefix = "^".encode("latin-1")
    else:
        prefix = bb("%s^" % self._to_string(context))

    index = self.__indicies[0]
    cursor = index.cursor()
    try:
        current = cursor.set_range(prefix)
        count = 0
        while current:
            key, _value = current
            if not key.startswith(prefix):
                break
            count += 1
            # ``next`` must be called; assigning the bound method would
            # break the unpacking on the next iteration.
            current = cursor.next()
    finally:
        # release the cursor even if the scan fails
        cursor.close()
    return count
def bind(self, prefix: str, namespace: URIRef, override: bool = True) -> None:
    """
    Record a prefix <-> namespace binding in both lookup databases.

    With ``override`` any existing binding of either side is deleted
    first; without it, existing bindings take precedence over the
    requested ones.
    """
    prefix_key = prefix.encode("utf-8")
    namespace_key = namespace.encode("utf-8")
    bound_prefix = self.__prefix.get(namespace_key)
    bound_namespace = self.__namespace.get(prefix_key)

    if not override:
        self.__prefix[bound_namespace or namespace_key] = bound_prefix or prefix_key
        self.__namespace[bound_prefix or prefix_key] = (
            bound_namespace or namespace_key
        )
        return

    if bound_prefix:
        self.__namespace.delete(bound_prefix)
    if bound_namespace:
        self.__prefix.delete(bound_namespace)
    self.__prefix[namespace_key] = prefix_key
    self.__namespace[prefix_key] = namespace_key
def namespace(self, prefix: str) -> Optional[URIRef]:
    """Return the namespace bound to ``prefix``, or ``None`` if unbound."""
    stored = self.__namespace.get(prefix.encode("utf-8"), None)
    return None if stored is None else URIRef(stored.decode("utf-8"))
def prefix(self, namespace: URIRef) -> Optional[str]:
    """Return the prefix bound to ``namespace``, or ``None`` if unbound."""
    stored = self.__prefix.get(namespace.encode("utf-8"), None)
    return None if stored is None else stored.decode("utf-8")
def namespaces(self) -> Generator[Tuple[str, URIRef], None, None]:
    """
    Yield every ``(prefix, namespace)`` binding in the store.

    All bindings are read, and the cursor closed, before the first pair
    is yielded.
    """
    cursor = self.__namespace.cursor()
    results = []
    try:
        current = cursor.first()
        while current:
            prefix, namespace = current
            results.append((prefix.decode("utf-8"), namespace.decode("utf-8")))
            # ``next`` must be called; assigning the bound method would
            # break the unpacking on the next iteration.
            current = cursor.next()
    finally:
        # release the cursor even if reading fails
        cursor.close()
    for prefix, namespace in results:
        yield prefix, URIRef(namespace)
def contexts(
    self, triple: Optional[_TripleType] = None
) -> Generator[_ContextType, None, None]:
    """
    Yield the contexts of the store.

    With a ``triple``, only the contexts recorded for that triple under
    the empty context are yielded; without one, every entry of the
    contexts database is yielded.
    """
    _from_string = self._from_string
    _to_string = self._to_string
    # NOTE on type errors: context is lost because of how data is loaded
    # from the DB.
    if triple:
        s: str
        p: str
        o: str
        # type error: Incompatible types in assignment (expression has type "Node", variable has type "str")
        s, p, o = triple  # type: ignore[assignment]
        # type error: Argument 1 has incompatible type "str"; expected "Node"
        s = _to_string(s)  # type: ignore[arg-type]
        # type error: Argument 1 has incompatible type "str"; expected "Node"
        p = _to_string(p)  # type: ignore[arg-type]
        # type error: Argument 1 has incompatible type "str"; expected "Node"
        o = _to_string(o)  # type: ignore[arg-type]
        contexts = self.__indicies[0].get(bb("%s^%s^%s^%s^" % ("", s, p, o)))
        if contexts:
            for c in contexts.split("^".encode("latin-1")):
                if c:
                    # type error: Incompatible types in "yield" (actual type "Node", expected type "Graph")
                    yield _from_string(c)  # type: ignore[misc]
    else:
        index = self.__contexts
        cursor = index.cursor()
        current = cursor.first()
        cursor.close()
        while current:
            key, value = current
            context = _from_string(key)
            # type error: Incompatible types in "yield" (actual type "Node", expected type "Graph")
            yield context  # type: ignore[misc]
            # No cursor is held across the ``yield``: a fresh one is
            # positioned on ``key`` and advanced once.
            cursor = index.cursor()
            try:
                cursor.set_range(key)
                # ``next`` must be called; assigning the bound method
                # would break the unpacking on the next iteration.
                current = cursor.next()
            except db.DBNotFoundError:
                current = None
            cursor.close()
def add_graph(self, graph: Graph) -> None:
    """Record ``graph`` in the contexts database with an empty value."""
    context_key = bb(self._to_string(graph))
    self.__contexts.put(context_key, b"")
def remove_graph(self, graph: Graph):
    """Remove ``graph`` by deleting every triple in it via ``self.remove``."""
    match_everything = (None, None, None)
    self.remove(match_everything, graph)
def _from_string(self, i: bytes) -> Node:
    """Turn a stored id back into a term.

    ``i`` is converted to ``int``, looked up in the id-to-key table, and the
    value found there is passed through ``self._loads``.
    """
    serialized = self.__i2k.get(int(i))
    return self._loads(serialized)
def _to_string(self, term: Node, txn: Optional[Any] = None) -> str:
    """Return the string id under which ``term`` is stored, allocating one if needed.

    The term is serialized with ``self._dumps`` and looked up in the
    key-to-id table.  A hit is decoded and returned.  On a miss the
    serialized key is appended to the id-to-key table, the value returned by
    ``append`` is formatted as a string, recorded in the key-to-id table and
    returned.
    """
    key = self._dumps(term)
    known = self.__k2i.get(key, txn=txn)
    if known is not None:
        return known.decode()

    # weird behaviour from bsddb not taking a txn as a keyword argument
    # for append
    if self.transaction_aware:
        allocated = self.__i2k.append(key, txn)
    else:
        allocated = self.__i2k.append(key)
    ident = "%s" % allocated
    self.__k2i.put(key, ident.encode(), txn=txn)
    return ident
def __lookup(
    self,
    spo: _TriplePatternType,
    context: Optional[_ContextType],
    txn: Optional[Any] = None,
) -> Tuple[db.DB, bytes, _FromKeyFunc, _ResultsFromKeyFunc]:
    """Select the index and key prefix to use for a triple pattern.

    Each bound term (and the context, if any) is encoded with
    ``_to_string``.  Which of subject/predicate/object are bound forms a
    bit mask (1, 2 and 4 respectively) that selects an entry of the lookup
    table.  That entry supplies the index, a function producing the prefix
    parts, and the two key-decoding functions.

    Returns ``(index, prefix, from_key, results_from_key)`` where ``prefix``
    is the ``^``-joined prefix parts passed through ``bb``.
    """
    subject, predicate, object_ = spo
    encode = self._to_string
    # NOTE on type errors: this is because the same variable is used with different types.
    # The context is encoded first, then subject, predicate, object, which
    # is the order in which new ids get allocated.
    context_key = None if context is None else encode(context, txn=txn)

    mask = 0
    encoded = []
    for bit, term in ((1, subject), (2, predicate), (4, object_)):
        if term is None:
            encoded.append(None)
            continue
        mask += bit
        encoded.append(encode(term, txn=txn))

    index, prefix_func, from_key, results_from_key = self.__lookup_dict[mask]
    # type error: Argument 1 has incompatible type "Tuple[Node, Node, Node]"; expected "Tuple[str, str, str]"
    # type error: Argument 2 has incompatible type "Optional[Graph]"; expected "Optional[str]"
    prefix_parts = prefix_func(tuple(encoded), context_key)  # type: ignore[arg-type]
    prefix = bb("^".join(prefix_parts))
    return index, prefix, from_key, results_from_key
def to_key_func(i: int) -> _ToKeyFunc:
    """Return a function that builds index keys for rotation ``i``.

    The returned function joins the context and the three triple terms,
    rotated left by ``i`` positions, with ``^`` and appends a trailing ``^``.
    """

    def to_key(triple: Tuple[bytes, bytes, bytes], context: bytes) -> bytes:
        """Encode ``triple`` and ``context`` into a single key."""
        rotated = tuple(triple[(i + offset) % 3] for offset in range(3))
        # The empty last element is what produces the trailing "^".
        return b"^".join((context, *rotated, b""))

    return to_key
def from_key_func(i: int) -> _FromKeyFunc:
    """Return a function that splits a rotation-``i`` key back into its parts.

    The returned function yields ``(context, subject, predicate, object)``,
    undoing the term rotation applied when the key was built.
    """

    def from_key(key: bytes) -> Tuple[bytes, bytes, bytes, bytes]:
        """Decode ``key`` into a ``(context, s, p, o)`` tuple of byte strings."""
        parts = key.split(b"^")
        terms = tuple(parts[(3 - i + offset) % 3 + 1] for offset in range(3))
        return (parts[0],) + terms

    return from_key
def results_from_key_func(
    i: int, from_string: Callable[[bytes], Node]
) -> _ResultsFromKeyFunc:
    """Return a function that turns a rotation-``i`` key into a result row.

    ``from_string`` decodes one stored id.  It is only applied to the key
    parts whose term was not already supplied by the caller, and lazily to
    the context ids.
    """

    def from_key(
        key: bytes,
        subject: Optional[Node],
        predicate: Optional[Node],
        object: Optional[Node],
        contexts_value: bytes,
    ) -> Tuple[Tuple[Node, Node, Node], Generator[Node, None, None]]:
        """Build ``((s, p, o), contexts)`` from a key and the bound terms.

        Terms passed as ``None`` are decoded from ``key``; the others are
        returned unchanged.  ``contexts`` is a generator over the non-empty
        ``^``-separated ids in ``contexts_value``.
        """
        parts = key.split(b"^")
        # TODO: i & 1: # dis assemble and/or measure to see which is faster
        # subject is None or i & 1
        s = from_string(parts[(3 - i + 0) % 3 + 1]) if subject is None else subject
        p = (
            from_string(parts[(3 - i + 1) % 3 + 1]) if predicate is None else predicate
        )
        o = from_string(parts[(3 - i + 2) % 3 + 1]) if object is None else object
        context_ids = contexts_value.split(b"^")
        return (s, p, o), (from_string(c) for c in context_ids if c)

    return from_key
# TODO: Remove unused
def readable_index(i: int) -> str:
    """Render the bit mask ``i`` as ``"s,p,o"`` with ``?`` for unset bits.

    Bit 1 stands for the subject, bit 2 for the predicate and bit 4 for the
    object.
    """
    slots = ((1, "s"), (2, "p"), (4, "o"))
    return ",".join(letter if i & bit else "?" for bit, letter in slots)
@@ -0,0 +1,95 @@
from threading import Lock
class ResponsibleGenerator:
    """Iterator wrapper that runs a cleanup callback when it is finalized.

    Iteration is delegated to the wrapped ``gen``; ``cleanup`` is called
    from ``__del__``.
    """

    __slots__ = ["cleanup", "gen"]

    def __init__(self, gen, cleanup):
        self.gen = gen
        self.cleanup = cleanup

    def __iter__(self):
        """Return the wrapper itself so it can drive a ``for`` loop."""
        return self

    def __next__(self):
        """Produce the next item of the wrapped generator."""
        item = next(self.gen)
        return item

    def __del__(self):
        """Invoke the cleanup callback when the wrapper is finalized."""
        self.cleanup()
class ConcurrentStore:
    """Wrapper that defers writes to ``store`` while it is being iterated.

    While at least one ``triples()`` iteration is in progress, ``add`` and
    ``remove`` only queue their triple.  The queues are applied to the
    wrapped store when the last in-progress iteration finishes.
    """

    def __init__(self, store):
        self.store = store

        # number of calls to visit still in progress
        self.__visit_count = 0

        # lock for locking down the indices
        self.__lock = Lock()

        # lists for keeping track of added and removed triples while
        # we wait for the lock
        self.__pending_removes = []
        self.__pending_adds = []

    def add(self, triple):
        """Add ``triple`` now, or queue it if an iteration is in progress."""
        (s, p, o) = triple
        if self.__visit_count == 0:
            self.store.add((s, p, o))
        else:
            self.__pending_adds.append((s, p, o))

    def remove(self, triple):
        """Remove ``triple`` now, or queue the removal if an iteration is in progress."""
        (s, p, o) = triple
        if self.__visit_count == 0:
            self.store.remove((s, p, o))
        else:
            self.__pending_removes.append((s, p, o))

    def triples(self, triple):
        """Yield the triples matching the ``(s, p, o)`` pattern ``triple``.

        Triples from the wrapped store that are queued for removal are
        skipped.  Queued additions that match the pattern are yielded after
        the store's own results.
        """
        (su, pr, ob) = triple
        g = self.store.triples((su, pr, ob))
        pending_removes = self.__pending_removes
        self.__begin_read()
        # The read is ended by the wrapper's finalizer, i.e. when the
        # ResponsibleGenerator object is released.
        for s, p, o in ResponsibleGenerator(g, self.__end_read):
            if (s, p, o) not in pending_removes:
                yield s, p, o

        for s, p, o in self.__pending_adds:
            if (
                (su is None or su == s)
                and (pr is None or pr == p)
                and (ob is None or ob == o)
            ):
                yield s, p, o

    def __len__(self):
        """Return the length reported by the wrapped store."""
        return self.store.__len__()

    def __begin_read(self):
        """Register one more in-progress iteration."""
        with self.__lock:
            self.__visit_count = self.__visit_count + 1

    def __end_read(self):
        """Unregister an iteration and, if it was the last, flush the queues.

        The lock is held through a ``with`` block so that it is released
        even when the wrapped store raises while the queues are applied.
        """
        with self.__lock:
            self.__visit_count = self.__visit_count - 1
            if self.__visit_count == 0:
                pending_removes = self.__pending_removes
                while pending_removes:
                    (s, p, o) = pending_removes.pop()
                    try:
                        self.store.remove((s, p, o))
                    except Exception:
                        # Best effort: the triple may already be gone.
                        print(s, p, o, "Not in store to remove")
                pending_adds = self.__pending_adds
                while pending_adds:
                    (s, p, o) = pending_adds.pop()
                    self.store.add((s, p, o))
@@ -0,0 +1,737 @@
#
#
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
Collection,
Dict,
Generator,
Iterator,
Mapping,
Optional,
Set,
Tuple,
Union,
overload,
)
from rdflib.store import Store
from rdflib.util import _coalesce
if TYPE_CHECKING:
from rdflib.graph import (
Graph,
_ContextType,
_ObjectType,
_PredicateType,
_SubjectType,
_TriplePatternType,
_TripleType,
)
from rdflib.plugins.sparql.sparql import Query, Update
from rdflib.query import Result
from rdflib.term import Identifier, URIRef
__all__ = ["SimpleMemory", "Memory"]
ANY: None = None
class SimpleMemory(Store):
    """\
    A fast naive in memory implementation of a triple store.

    This triple store uses nested dictionaries to store triples. Each
    triple is stored in three such indices as follows spo[s][p][o] = 1,
    pos[p][o][s] = 1 and osp[o][s][p] = 1.

    Authors: Michel Pelletier, Daniel Krech, Stefan Niederhauser
    """

    def __init__(
        self,
        configuration: Optional[str] = None,
        identifier: Optional[Identifier] = None,
    ):
        super(SimpleMemory, self).__init__(configuration)
        self.identifier = identifier

        # indexed by [subject][predicate][object]
        self.__spo: Dict[_SubjectType, Dict[_PredicateType, Dict[_ObjectType, int]]] = (
            {}
        )

        # indexed by [predicate][object][subject]
        self.__pos: Dict[_PredicateType, Dict[_ObjectType, Dict[_SubjectType, int]]] = (
            {}
        )

        # indexed by [object][subject][predicate]
        self.__osp: Dict[_ObjectType, Dict[_SubjectType, Dict[_PredicateType, int]]] = (
            {}
        )

        self.__namespace: Dict[str, URIRef] = {}
        self.__prefix: Dict[URIRef, str] = {}

    def add(
        self,
        triple: _TripleType,
        context: _ContextType,
        quoted: bool = False,
    ) -> None:
        """\
        Add a triple to the store of triples.

        ``context`` and ``quoted`` are accepted but not used by this store.
        """
        # Set spo[s][p][o] = 1, pos[p][o][s] = 1 and osp[o][s][p] = 1,
        # creating the nested dictionaries where they do not yet exist.
        subject, predicate, object_ = triple
        self.__spo.setdefault(subject, {}).setdefault(predicate, {})[object_] = 1
        self.__pos.setdefault(predicate, {}).setdefault(object_, {})[subject] = 1
        self.__osp.setdefault(object_, {}).setdefault(subject, {})[predicate] = 1

    def remove(
        self,
        triple_pattern: _TriplePatternType,
        context: Optional[_ContextType] = None,
    ) -> None:
        """Delete every triple matching ``triple_pattern`` from all three indices.

        ``context`` is accepted but not used by this store.
        """
        # Materialize the matches first: the indices are mutated in the loop.
        for (subject, predicate, object_), _ in list(self.triples(triple_pattern)):
            del self.__spo[subject][predicate][object_]
            del self.__pos[predicate][object_][subject]
            del self.__osp[object_][subject][predicate]

    def triples(
        self,
        triple_pattern: _TriplePatternType,
        context: Optional[_ContextType] = None,
    ) -> Iterator[Tuple[_TripleType, Iterator[Optional[_ContextType]]]]:
        """A generator over all the triples matching ``triple_pattern``.

        ``None`` in a pattern slot matches anything.  Each result is paired
        with an empty context generator.  ``context`` is not used.
        """
        subject, predicate, object_ = triple_pattern
        no_contexts = self.__contexts

        if subject is not ANY:  # subject is given
            predicates = self.__spo.get(subject)
            if predicates is None:  # given subject not found
                return
            if predicate is not ANY:  # subject+predicate is given
                objects = predicates.get(predicate)
                if objects is None:  # given predicate not found
                    return
                if object_ is not ANY:  # subject+predicate+object is given
                    if object_ in objects:
                        yield (subject, predicate, object_), no_contexts()
                else:  # subject+predicate is given, object unbound
                    for o in objects:
                        yield (subject, predicate, o), no_contexts()
            else:  # subject given, predicate unbound
                for p, objects in predicates.items():
                    if object_ is not ANY:  # object is given
                        if object_ in objects:
                            yield (subject, p, object_), no_contexts()
                    else:  # object unbound
                        for o in objects:
                            yield (subject, p, o), no_contexts()
        elif predicate is not ANY:  # predicate is given, subject unbound
            objects_index = self.__pos.get(predicate)
            if objects_index is None:
                return
            if object_ is not ANY:  # predicate+object is given, subject unbound
                for s in objects_index.get(object_, ()):
                    yield (s, predicate, object_), no_contexts()
            else:  # predicate is given, object+subject unbound
                for o, subjects in objects_index.items():
                    for s in subjects:
                        yield (s, predicate, o), no_contexts()
        elif object_ is not ANY:  # object is given, subject+predicate unbound
            for s, predicates in self.__osp.get(object_, {}).items():
                for p in predicates:
                    yield (s, p, object_), no_contexts()
        else:  # subject+predicate+object unbound
            for s, predicates in self.__spo.items():
                for p, objects in predicates.items():
                    for o in objects:
                        yield (s, p, o), no_contexts()

    def __len__(self, context: Optional[_ContextType] = None) -> int:
        """Return the number of triples in the store, counted by iterating them."""
        # @@ optimize
        return sum(1 for _ in self.triples((None, None, None)))

    def bind(self, prefix: str, namespace: URIRef, override: bool = True) -> None:
        """Bind ``prefix`` to ``namespace``.

        With ``override`` set, any existing binding of either the prefix or
        the namespace is replaced.  Otherwise existing bindings take
        precedence over the requested one.
        """
        # should be identical to `Memory.bind`
        bound_namespace = self.__namespace.get(prefix)
        bound_prefix = _coalesce(
            self.__prefix.get(namespace),
            # type error: error: Argument 1 to "get" of "Mapping" has incompatible type "Optional[URIRef]"; expected "URIRef"
            self.__prefix.get(bound_namespace),  # type: ignore[arg-type]
        )
        if override:
            if bound_prefix is not None:
                del self.__namespace[bound_prefix]
            if bound_namespace is not None:
                del self.__prefix[bound_namespace]
            self.__prefix[namespace] = prefix
            self.__namespace[prefix] = namespace
        else:
            # type error: Invalid index type "Optional[URIRef]" for "Dict[URIRef, str]"; expected type "URIRef"
            self.__prefix[_coalesce(bound_namespace, namespace)] = _coalesce(  # type: ignore[index]
                bound_prefix, default=prefix
            )
            # type error: Invalid index type "Optional[str]" for "Dict[str, URIRef]"; expected type "str"
            self.__namespace[_coalesce(bound_prefix, prefix)] = _coalesce(  # type: ignore[index]
                bound_namespace, default=namespace
            )

    def namespace(self, prefix: str) -> Optional[URIRef]:
        """Return the namespace bound to ``prefix``, or ``None``."""
        return self.__namespace.get(prefix, None)

    def prefix(self, namespace: URIRef) -> Optional[str]:
        """Return the prefix bound to ``namespace``, or ``None``."""
        return self.__prefix.get(namespace, None)

    def namespaces(self) -> Iterator[Tuple[str, URIRef]]:
        """Yield every ``(prefix, namespace)`` binding."""
        yield from self.__namespace.items()

    def __contexts(self) -> Generator[_ContextType, None, None]:
        """Return an empty generator; this store keeps no context information."""
        # TODO: best way to return empty generator
        # type error: Need type annotation for "c"
        return (c for c in [])  # type: ignore[var-annotated]

    def query(
        self,
        query: Union[Query, str],
        initNs: Mapping[str, Any],  # noqa: N803
        initBindings: Mapping[str, Identifier],  # noqa: N803
        queryGraph: str,  # noqa: N803
        **kwargs: Any,
    ) -> Result:
        """Delegate to the parent class' ``query`` and return its result."""
        return super(SimpleMemory, self).query(
            query, initNs, initBindings, queryGraph, **kwargs
        )

    def update(
        self,
        update: Union[Update, str],
        initNs: Mapping[str, Any],  # noqa: N803
        initBindings: Mapping[str, Identifier],  # noqa: N803
        queryGraph: str,  # noqa: N803
        **kwargs: Any,
    ) -> None:
        """Delegate to the parent class' ``update``."""
        super(SimpleMemory, self).update(
            update, initNs, initBindings, queryGraph, **kwargs
        )
class Memory(Store):
    """\
    An in memory implementation of a triple store.

    Same as SimpleMemory above, but is Context-aware, Graph-aware, and Formula-aware
    Authors: Ashley Sommer
    """

    context_aware = True
    formula_aware = True
    graph_aware = True

    def __init__(
        self,
        configuration: Optional[str] = None,
        identifier: Optional[Identifier] = None,
    ):
        super(Memory, self).__init__(configuration)
        self.identifier = identifier

        # indexed by [subject][predicate][object]
        self.__spo: Dict[_SubjectType, Dict[_PredicateType, Dict[_ObjectType, int]]] = (
            {}
        )

        # indexed by [predicate][object][subject]
        self.__pos: Dict[_PredicateType, Dict[_ObjectType, Dict[_SubjectType, int]]] = (
            {}
        )

        # indexed by [object][subject][predicate]
        self.__osp: Dict[_ObjectType, Dict[_SubjectType, Dict[_PredicateType, int]]] = (
            {}
        )

        self.__namespace: Dict[str, URIRef] = {}
        self.__prefix: Dict[URIRef, str] = {}
        # context string (see __ctx_to_str) -> the object it was derived from
        self.__context_obj_map: Dict[str, Graph] = {}
        # triple -> {context string: quoted flag}; triples whose mapping
        # equals __defaultContexts are not stored here
        self.__tripleContexts: Dict[_TripleType, Dict[Optional[str], bool]] = {}
        # context string -> triples in that context; None is the default context
        self.__contextTriples: Dict[Optional[str], Set[_TripleType]] = {None: set()}
        # all contexts used in store (unencoded)
        self.__all_contexts: Set[Graph] = set()
        # default context information for triples
        self.__defaultContexts: Optional[Dict[Optional[str], bool]] = None

    def add(
        self,
        triple: _TripleType,
        context: _ContextType,
        quoted: bool = False,
    ) -> None:
        """\
        Add a triple to the store of triples.

        The context bookkeeping is updated on every call; the three triple
        indices are only touched the first time a triple is seen.
        """
        # add dictionary entries for spo[s][p][o] = 1, pos[p][o][s] = 1 and
        # osp[o][s][p] = 1, creating the nested dictionaries where they do
        # not yet exist.
        Store.add(self, triple, context, quoted=quoted)
        if context is not None:
            self.__all_contexts.add(context)
        subject, predicate, object_ = triple

        spo = self.__spo
        try:
            po = spo[subject]
        except LookupError:
            po = spo[subject] = {}
        try:
            o = po[predicate]
        except LookupError:
            o = po[predicate] = {}

        try:
            _ = o[object_]
            # This cannot be reached if (s, p, o) was not inserted before.
            triple_exists = True
        except KeyError:
            o[object_] = 1
            triple_exists = False
        self.__add_triple_context(triple, triple_exists, context, quoted)

        if triple_exists:
            # No need to insert twice this triple.
            return

        pos = self.__pos
        try:
            os = pos[predicate]
        except LookupError:
            os = pos[predicate] = {}
        try:
            s = os[object_]
        except LookupError:
            s = os[object_] = {}
        s[subject] = 1

        osp = self.__osp
        try:
            sp = osp[object_]
        except LookupError:
            sp = osp[object_] = {}
        try:
            p = sp[subject]
        except LookupError:
            p = sp[subject] = {}
        p[predicate] = 1

    def remove(
        self,
        triple_pattern: _TriplePatternType,
        context: Optional[_ContextType] = None,
    ) -> None:
        """Remove the triples matching ``triple_pattern``.

        With a ``context`` the triples are removed from that context only;
        without one they are removed from every context.  A triple is
        dropped from the indices once no context refers to it any more.
        """
        req_ctx = self.__ctx_to_str(context)
        for triple, c in self.triples(triple_pattern, context=context):
            subject, predicate, object_ = triple
            for ctx in self.__get_context_for_triple(triple):
                if context is not None and req_ctx != ctx:
                    continue
                self.__remove_triple_context(triple, ctx)
            ctxs = self.__get_context_for_triple(triple, skipQuoted=True)
            if None in ctxs and (context is None or len(ctxs) == 1):
                # remove from default graph too
                self.__remove_triple_context(triple, None)
            if len(self.__get_context_for_triple(triple)) == 0:
                del self.__spo[subject][predicate][object_]
                del self.__pos[predicate][object_][subject]
                del self.__osp[object_][subject][predicate]
                del self.__tripleContexts[triple]
        if (
            req_ctx is not None
            and req_ctx in self.__contextTriples
            and len(self.__contextTriples[req_ctx]) == 0
        ):
            # all triples are removed out of this context
            # and it's not the default context so delete it
            del self.__contextTriples[req_ctx]

        if (
            triple_pattern == (None, None, None)
            and context in self.__all_contexts
            and not self.graph_aware
        ):
            # remove the whole context
            self.__all_contexts.remove(context)

    def triples(
        self,
        triple_pattern: _TriplePatternType,
        context: Optional[_ContextType] = None,
    ) -> Generator[
        Tuple[_TripleType, Generator[Optional[_ContextType], None, None]],
        None,
        None,
    ]:
        """A generator over all the triples matching ``triple_pattern``.

        ``None`` in a pattern slot matches anything.  Only triples present
        in ``context`` (the default context when ``None``) are produced;
        each is paired with a generator over its non-quoted contexts.
        """
        req_ctx = self.__ctx_to_str(context)
        subject, predicate, object_ = triple_pattern

        # all triples case (no triple parts given as pattern)
        if subject is None and predicate is None and object_ is None:
            # Just dump all known triples from the given graph
            if req_ctx not in self.__contextTriples:
                return
            for triple in self.__contextTriples[req_ctx].copy():
                yield triple, self.__contexts(triple)

        # optimize "triple in graph" case (all parts given)
        elif subject is not None and predicate is not None and object_ is not None:
            # type error: Incompatible types in assignment (expression has type "Tuple[Optional[IdentifiedNode], Optional[IdentifiedNode], Optional[Identifier]]", variable has type "Tuple[IdentifiedNode, IdentifiedNode, Identifier]")
            # NOTE on type error: at this point, all elements of triple_pattern
            # is not None, so it has the same type as triple
            triple = triple_pattern  # type: ignore[assignment]
            try:
                _ = self.__spo[subject][predicate][object_]
                if self.__triple_has_context(triple, req_ctx):
                    yield triple, self.__contexts(triple)
            except KeyError:
                return

        elif subject is not None:  # subject is given
            spo = self.__spo
            if subject in spo:
                subjectDictionary = spo[subject]  # noqa: N806
                if predicate is not None:  # subject+predicate is given
                    if predicate in subjectDictionary:
                        if object_ is not None:  # subject+predicate+object is given
                            if object_ in subjectDictionary[predicate]:
                                triple = (subject, predicate, object_)
                                if self.__triple_has_context(triple, req_ctx):
                                    yield triple, self.__contexts(triple)
                            else:  # given object not found
                                pass
                        else:  # subject+predicate is given, object unbound
                            for o in list(subjectDictionary[predicate].keys()):
                                triple = (subject, predicate, o)
                                if self.__triple_has_context(triple, req_ctx):
                                    yield triple, self.__contexts(triple)
                    else:  # given predicate not found
                        pass
                else:  # subject given, predicate unbound
                    for p in list(subjectDictionary.keys()):
                        if object_ is not None:  # object is given
                            if object_ in subjectDictionary[p]:
                                triple = (subject, p, object_)
                                if self.__triple_has_context(triple, req_ctx):
                                    yield triple, self.__contexts(triple)
                            else:  # given object not found
                                pass
                        else:  # object unbound
                            for o in list(subjectDictionary[p].keys()):
                                triple = (subject, p, o)
                                if self.__triple_has_context(triple, req_ctx):
                                    yield triple, self.__contexts(triple)
            else:  # given subject not found
                pass

        elif predicate is not None:  # predicate is given, subject unbound
            pos = self.__pos
            if predicate in pos:
                predicateDictionary = pos[predicate]  # noqa: N806
                if object_ is not None:  # predicate+object is given, subject unbound
                    if object_ in predicateDictionary:
                        for s in list(predicateDictionary[object_].keys()):
                            triple = (s, predicate, object_)
                            if self.__triple_has_context(triple, req_ctx):
                                yield triple, self.__contexts(triple)
                    else:  # given object not found
                        pass
                else:  # predicate is given, object+subject unbound
                    for o in list(predicateDictionary.keys()):
                        for s in list(predicateDictionary[o].keys()):
                            triple = (s, predicate, o)
                            if self.__triple_has_context(triple, req_ctx):
                                yield triple, self.__contexts(triple)

        elif object_ is not None:  # object is given, subject+predicate unbound
            osp = self.__osp
            if object_ in osp:
                objectDictionary = osp[object_]  # noqa: N806
                for s in list(objectDictionary.keys()):
                    for p in list(objectDictionary[s].keys()):
                        triple = (s, p, object_)
                        if self.__triple_has_context(triple, req_ctx):
                            yield triple, self.__contexts(triple)
        else:  # subject+predicate+object unbound
            # Shouldn't get here if all other cases above worked correctly.
            spo = self.__spo
            for s in list(spo.keys()):
                subjectDictionary = spo[s]  # noqa: N806
                for p in list(subjectDictionary.keys()):
                    for o in list(subjectDictionary[p].keys()):
                        triple = (s, p, o)
                        if self.__triple_has_context(triple, req_ctx):
                            yield triple, self.__contexts(triple)

    def bind(self, prefix: str, namespace: URIRef, override: bool = True) -> None:
        """Bind ``prefix`` to ``namespace``.

        With ``override`` set, any existing binding of either the prefix or
        the namespace is replaced.  Otherwise existing bindings take
        precedence over the requested one.
        """
        # should be identical to `SimpleMemory.bind`
        bound_namespace = self.__namespace.get(prefix)
        bound_prefix = _coalesce(
            self.__prefix.get(namespace),
            # type error: error: Argument 1 to "get" of "Mapping" has incompatible type "Optional[URIRef]"; expected "URIRef"
            self.__prefix.get(bound_namespace),  # type: ignore[arg-type]
        )
        if override:
            if bound_prefix is not None:
                del self.__namespace[bound_prefix]
            if bound_namespace is not None:
                del self.__prefix[bound_namespace]
            self.__prefix[namespace] = prefix
            self.__namespace[prefix] = namespace
        else:
            # type error: Invalid index type "Optional[URIRef]" for "Dict[URIRef, str]"; expected type "URIRef"
            self.__prefix[_coalesce(bound_namespace, namespace)] = _coalesce(  # type: ignore[index]
                bound_prefix, default=prefix
            )
            # type error: Invalid index type "Optional[str]" for "Dict[str, URIRef]"; expected type "str"
            # type error: Incompatible types in assignment (expression has type "Optional[URIRef]", target has type "URIRef")
            self.__namespace[_coalesce(bound_prefix, prefix)] = _coalesce(  # type: ignore[index]
                bound_namespace, default=namespace
            )

    def namespace(self, prefix: str) -> Optional[URIRef]:
        """Return the namespace bound to ``prefix``, or ``None``."""
        return self.__namespace.get(prefix, None)

    def prefix(self, namespace: URIRef) -> Optional[str]:
        """Return the prefix bound to ``namespace``, or ``None``."""
        return self.__prefix.get(namespace, None)

    def namespaces(self) -> Iterator[Tuple[str, URIRef]]:
        """Yield every ``(prefix, namespace)`` binding."""
        for prefix, namespace in self.__namespace.items():
            yield prefix, namespace

    def contexts(
        self, triple: Optional[_TripleType] = None
    ) -> Generator[_ContextType, None, None]:
        """Return a generator over contexts.

        Without a triple (or with an all-``None`` one) every known context
        is produced.  Otherwise the non-quoted contexts of that triple are
        produced, or nothing if the triple is not in the store.
        """
        if triple is None or triple == (None, None, None):
            return (context for context in self.__all_contexts)

        subj, pred, obj = triple
        try:
            _ = self.__spo[subj][pred][obj]
            return self.__contexts(triple)
        except KeyError:
            return (_ for _ in [])

    def __len__(self, context: Optional[_ContextType] = None) -> int:
        """Return the number of triples in ``context`` (default context if ``None``)."""
        ctx = self.__ctx_to_str(context)
        if ctx not in self.__contextTriples:
            return 0
        return len(self.__contextTriples[ctx])

    def add_graph(self, graph: Graph) -> None:
        """Register ``graph`` as a known context."""
        if not self.graph_aware:
            Store.add_graph(self, graph)
        else:
            self.__all_contexts.add(graph)

    def remove_graph(self, graph: Graph) -> None:
        """Remove all triples of ``graph`` and forget the graph itself."""
        if not self.graph_aware:
            Store.remove_graph(self, graph)
        else:
            self.remove((None, None, None), graph)
            try:
                self.__all_contexts.remove(graph)
            except KeyError:
                pass  # we didn't know this graph, no problem

    # internal utility methods below
    def __add_triple_context(
        self,
        triple: _TripleType,
        triple_exists: bool,
        context: Optional[_ContextType],
        quoted: bool,
    ) -> None:
        """add the given context to the set of contexts for the triple"""
        ctx = self.__ctx_to_str(context)
        quoted = bool(quoted)
        if triple_exists:
            # we know the triple exists somewhere in the store
            try:
                triple_context = self.__tripleContexts[triple]
            except KeyError:
                # triple exists with default ctx info
                # start with a copy of the default ctx info
                # type error: Item "None" of "Optional[Dict[Optional[str], bool]]" has no attribute "copy"
                triple_context = self.__tripleContexts[triple] = (
                    self.__defaultContexts.copy()  # type: ignore[union-attr]
                )

            triple_context[ctx] = quoted

            if not quoted:
                triple_context[None] = quoted

        else:
            # the triple didn't exist before in the store
            if quoted:  # this context only
                triple_context = self.__tripleContexts[triple] = {ctx: quoted}
            else:  # default context as well
                triple_context = self.__tripleContexts[triple] = {
                    ctx: quoted,
                    None: quoted,
                }

        # if the triple is not quoted add it to the default context
        if not quoted:
            self.__contextTriples[None].add(triple)

        # always add the triple to given context, making sure it's initialized
        if ctx not in self.__contextTriples:
            self.__contextTriples[ctx] = set()
        self.__contextTriples[ctx].add(triple)

        # if this is the first ever triple in the store, set default ctx info
        if self.__defaultContexts is None:
            self.__defaultContexts = triple_context

        # if the context info is the same as default, no need to store it
        if triple_context == self.__defaultContexts:
            del self.__tripleContexts[triple]

    def __get_context_for_triple(
        self, triple: _TripleType, skipQuoted: bool = False  # noqa: N803
    ) -> Collection[Optional[str]]:
        """return a list of contexts (str) for the triple, skipping
        quoted contexts if skipQuoted==True"""

        ctxs = self.__tripleContexts.get(triple, self.__defaultContexts)

        if not skipQuoted:
            # type error: Item "None" of "Optional[Dict[Optional[str], bool]]" has no attribute "keys"
            return ctxs.keys()  # type: ignore[union-attr]

        # type error: Item "None" of "Optional[Dict[Optional[str], bool]]" has no attribute "items"
        return [ctx for ctx, quoted in ctxs.items() if not quoted]  # type: ignore[union-attr]

    def __triple_has_context(self, triple: _TripleType, ctx: Optional[str]) -> bool:
        """return True if the triple exists in the given context"""
        # type error: Unsupported right operand type for in ("Optional[Dict[Optional[str], bool]]")
        return ctx in self.__tripleContexts.get(triple, self.__defaultContexts)  # type: ignore[operator]

    def __remove_triple_context(self, triple: _TripleType, ctx):
        """remove the context from the triple"""
        # type error: Item "None" of "Optional[Dict[Optional[str], bool]]" has no attribute "copy"
        ctxs = self.__tripleContexts.get(triple, self.__defaultContexts).copy()  # type: ignore[union-attr]
        del ctxs[ctx]
        if ctxs == self.__defaultContexts:
            del self.__tripleContexts[triple]
        else:
            self.__tripleContexts[triple] = ctxs
        self.__contextTriples[ctx].remove(triple)

    @overload
    def __ctx_to_str(self, ctx: _ContextType) -> str: ...

    @overload
    def __ctx_to_str(self, ctx: None) -> None: ...

    def __ctx_to_str(self, ctx: Optional[_ContextType]) -> Optional[str]:
        """Return the string key used internally for ``ctx``.

        ``None`` maps to ``None``.  Otherwise the key is
        ``"<class name>:<value>"`` and the original object is remembered so
        the key can be dereferenced later.

        Raises:
            RuntimeError: if ``ctx`` has no ``identifier`` and is not a ``str``.
        """
        if ctx is None:
            return None
        try:
            # ctx could be a graph. In that case, use its identifier
            ctx_str = "{}:{}".format(ctx.identifier.__class__.__name__, ctx.identifier)
            self.__context_obj_map[ctx_str] = ctx
            return ctx_str
        except AttributeError:
            # otherwise, ctx should be a URIRef or BNode or str
            # NOTE on type errors: This is actually never called with ctx value as str in all unit tests, so this seems like it should just not be here.
            # type error: Subclass of "Graph" and "str" cannot exist: would have incompatible method signatures
            if isinstance(ctx, str):  # type: ignore[unreachable]
                # type error: Statement is unreachable
                ctx_str = "{}:{}".format(ctx.__class__.__name__, ctx)  # type: ignore[unreachable]
                if ctx_str in self.__context_obj_map:
                    return ctx_str
                self.__context_obj_map[ctx_str] = ctx
                return ctx_str
            raise RuntimeError("Cannot use that type of object as a Graph context")

    def __contexts(self, triple: _TripleType) -> Generator[_ContextType, None, None]:
        """return a generator for all the non-quoted contexts
        (dereferenced) the encoded triple appears in"""
        # type error: Argument 2 to "get" of "Mapping" has incompatible type "str"; expected "Optional[Graph]"
        return (
            self.__context_obj_map.get(ctx_str, ctx_str)  # type: ignore[arg-type]
            for ctx_str in self.__get_context_for_triple(triple, skipQuoted=True)
            if ctx_str is not None
        )

    def query(
        self,
        query: Union[Query, str],
        initNs: Mapping[str, Any],  # noqa: N803
        initBindings: Mapping[str, Identifier],  # noqa: N803
        queryGraph: str,  # noqa: N803
        **kwargs,
    ) -> Result:
        """Delegate to the parent class' ``query`` and return its result."""
        return super(Memory, self).query(
            query, initNs, initBindings, queryGraph, **kwargs
        )

    def update(
        self,
        update: Union[Update, Any],
        initNs: Mapping[str, Any],  # noqa: N803
        initBindings: Mapping[str, Identifier],  # noqa: N803
        queryGraph: str,  # noqa: N803
        **kwargs,
    ) -> None:
        """Delegate to the parent class' ``update``."""
        super(Memory, self).update(update, initNs, initBindings, queryGraph, **kwargs)
@@ -0,0 +1,174 @@
"""
This wrapper intercepts calls through the store interface which make use of
the REGEXTerm class to represent matches by REGEX instead of literal
comparison.
Implemented for stores that don't support this and essentially
provides the support by replacing the REGEXTerms by wildcards (None) and
matching against the results from the store it's wrapping.
"""
import re
from rdflib.graph import Graph
from rdflib.store import Store
# Store is capable of doing its own REGEX matching
NATIVE_REGEX = 0
# Store uses Python's re module internally for REGEX matching
PYTHON_REGEX = 1
class REGEXTerm(str):
    """
    REGEXTerm can be used in any term slot and is interpreted as a request to
    perform a REGEX match (not a string comparison) using the value
    (pre-compiled) for checking rdf:type matches
    """

    def __init__(self, expr):
        # The string value itself is set by ``str.__new__``; here the
        # expression is compiled once so matches do not recompile it.
        self.compiledExpr = re.compile(expr)

    def __reduce__(self):
        """Support copy/pickle by rebuilding from the expression text.

        Rebuilding from the term's own string value keeps both the value
        and the compiled pattern; rebuilding from ``""`` would lose them.
        """
        return (REGEXTerm, (str(self),))
def regexCompareQuad(quad, regexQuad):  # noqa: N802, N803
    """Check ``quad`` against the REGEXTerm slots of ``regexQuad``.

    Only the first four positions are examined.  A position is tested only
    when ``regexQuad`` holds a ``REGEXTerm`` there; the compiled expression
    must then match the value of ``quad`` at the same position.  Returns
    ``False`` on the first failed match and ``True`` otherwise.
    """
    for position in range(4):
        pattern = regexQuad[position]
        if not isinstance(pattern, REGEXTerm):
            continue
        if not pattern.compiledExpr.match(quad[position]):
            return False
    return True
class REGEXMatching(Store):
def __init__(self, storage):
self.storage = storage
self.context_aware = storage.context_aware
# NOTE: this store can't be formula_aware as it doesn't have enough
# info to reverse the removal of a quoted statement.
self.formula_aware = storage.formula_aware
self.transaction_aware = storage.transaction_aware
def open(self, configuration, create=True):
return self.storage.open(configuration, create)
def close(self, commit_pending_transaction=False):
self.storage.close()
def destroy(self, configuration):
self.storage.destroy(configuration)
def add(self, triple, context, quoted=False):
(subject, predicate, object_) = triple
self.storage.add((subject, predicate, object_), context, quoted)
def remove(self, triple, context=None):
(subject, predicate, object_) = triple
if (
isinstance(subject, REGEXTerm)
or isinstance(predicate, REGEXTerm)
or isinstance(object_, REGEXTerm)
or (context is not None and isinstance(context.identifier, REGEXTerm))
):
# One or more of the terms is a REGEX expression, so we must
# replace it / them with wildcard(s)and match after we query.
s = not isinstance(subject, REGEXTerm) and subject or None
p = not isinstance(predicate, REGEXTerm) and predicate or None
o = not isinstance(object_, REGEXTerm) and object_ or None
c = (
(context is not None and not isinstance(context.identifier, REGEXTerm))
and context
or None
)
removeQuadList = [] # noqa: N806
for (s1, p1, o1), cg in self.storage.triples((s, p, o), c):
for ctx in cg:
ctx = ctx.identifier
if regexCompareQuad(
(s1, p1, o1, ctx),
(
subject,
predicate,
object_,
context is not None and context.identifier or context,
),
):
removeQuadList.append((s1, p1, o1, ctx))
for s, p, o, c in removeQuadList:
self.storage.remove((s, p, o), c and Graph(self, c) or c)
else:
self.storage.remove((subject, predicate, object_), context)
def triples(self, triple, context=None):
(subject, predicate, object_) = triple
if (
isinstance(subject, REGEXTerm)
or isinstance(predicate, REGEXTerm)
or isinstance(object_, REGEXTerm)
or (context is not None and isinstance(context.identifier, REGEXTerm))
):
# One or more of the terms is a REGEX expression, so we must
# replace it / them with wildcard(s) and match after we query.
s = not isinstance(subject, REGEXTerm) and subject or None
p = not isinstance(predicate, REGEXTerm) and predicate or None
o = not isinstance(object_, REGEXTerm) and object_ or None
c = (
(context is not None and not isinstance(context.identifier, REGEXTerm))
and context
or None
)
for (s1, p1, o1), cg in self.storage.triples((s, p, o), c):
matchingCtxs = [] # noqa: N806
for ctx in cg:
if c is None:
if context is None or context.identifier.compiledExpr.match(
ctx.identifier
):
matchingCtxs.append(ctx)
else:
matchingCtxs.append(ctx)
if matchingCtxs and regexCompareQuad(
(s1, p1, o1, None), (subject, predicate, object_, None)
):
yield (s1, p1, o1), (c for c in matchingCtxs)
else:
for (s1, p1, o1), cg in self.storage.triples(
(subject, predicate, object_), context
):
yield (s1, p1, o1), cg
def __len__(self, context=None):
    """Return the size reported by the wrapped store for ``context``."""
    # NOTE: If the context is a REGEX this could be an expensive
    # proposition
    size = self.storage.__len__(context)
    return size
def contexts(self, triple=None):
    """Yield every context the wrapped store reports for ``triple``."""
    # NOTE: There is no way to control REGEX matching for this method at
    # this level as it only returns the contexts, not the matching
    # triples.
    yield from self.storage.contexts(triple)
def remove_context(self, identifier):
    """Ask the wrapped store to remove every triple of ``identifier``."""
    match_everything = (None, None, None)
    self.storage.remove(match_everything, identifier)
def bind(self, prefix, namespace, override=True):
    """Forward a prefix/namespace binding to the wrapped store."""
    target = self.storage
    target.bind(prefix, namespace, override=override)
def prefix(self, namespace):
    """Return whatever the wrapped store reports as the prefix of ``namespace``."""
    bound_prefix = self.storage.prefix(namespace)
    return bound_prefix
def namespace(self, prefix):
    """Return whatever the wrapped store reports as the namespace of ``prefix``."""
    bound_namespace = self.storage.namespace(prefix)
    return bound_namespace
def namespaces(self):
    """Return the wrapped store's ``namespaces()`` result unchanged."""
    all_bindings = self.storage.namespaces()
    return all_bindings
def commit(self):
    """Forward the commit request to the wrapped store."""
    backend = self.storage
    backend.commit()
def rollback(self):
    """Forward the rollback request to the wrapped store."""
    backend = self.storage
    backend.rollback()
@@ -0,0 +1,192 @@
from __future__ import annotations
import base64
import copy
import logging
from io import BytesIO
from typing import TYPE_CHECKING, Optional, Tuple
from urllib.error import HTTPError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from rdflib.query import Result
from rdflib.term import BNode
log = logging.getLogger(__name__)
if TYPE_CHECKING:
import typing_extensions as te
class SPARQLConnectorException(Exception):  # noqa: N818
    """Error raised by :class:`SPARQLConnector` for invalid configuration or use."""
# TODO: Pull in these from the result implementation plugins?
# Maps a ``returnFormat`` name to the value sent in the HTTP ``Accept``
# header; SPARQLConnector looks it up as
# ``_response_mime_types[self.returnFormat]``, so an unknown format name
# raises ``KeyError`` there.
_response_mime_types = {
    "xml": "application/sparql-results+xml, application/rdf+xml",
    "json": "application/sparql-results+json",
    "csv": "text/csv",
    "tsv": "text/tab-separated-values",
    "application/rdf+xml": "application/rdf+xml",
}
class SPARQLConnector:
    """
    This class deals with the nitty-gritty details of talking to a SPARQL
    server over HTTP using ``urllib``.
    """

    def __init__(
        self,
        query_endpoint: Optional[str] = None,
        update_endpoint: Optional[str] = None,
        returnFormat: str = "xml",  # noqa: N803
        method: te.Literal["GET", "POST", "POST_FORM"] = "GET",
        auth: Optional[Tuple[str, str]] = None,
        **kwargs,
    ):
        """
        :param query_endpoint: URL used by :meth:`query`.
        :param update_endpoint: URL used by :meth:`update`.
        :param returnFormat: key of ``_response_mime_types`` selecting the
            ``Accept`` header sent with each request.
        :param method: one of ``"GET"``, ``"POST"`` or ``"POST_FORM"``.
        :param auth: if present, must be a tuple of (username, password)
            used for Basic Authentication.
        :param kwargs: stored on ``self.kwargs``; :meth:`query` and
            :meth:`update` read its ``params`` and ``headers`` entries and
            merge them into each request.
        :raises SPARQLConnectorException: if ``method`` or ``auth`` is invalid.
        """
        self._method: str
        self.returnFormat = returnFormat
        self.query_endpoint = query_endpoint
        self.update_endpoint = update_endpoint
        self.kwargs = kwargs
        self.method = method

        if auth is not None:
            if not isinstance(auth, tuple):
                raise SPARQLConnectorException("auth must be a tuple")
            if len(auth) != 2:
                raise SPARQLConnectorException("auth must be a tuple (user, password)")
            credentials = base64.b64encode(bytes("%s:%s" % auth, "ascii"))
            # Build a fresh dict instead of updating in place so that a
            # ``headers`` mapping supplied by the caller is never mutated.
            headers = dict(self.kwargs.get("headers") or {})
            headers["Authorization"] = "Basic %s" % credentials.decode("utf-8")
            self.kwargs["headers"] = headers

    @property
    def method(self) -> str:
        """The HTTP method used by :meth:`query`."""
        return self._method

    @method.setter
    def method(self, method: str) -> None:
        if method not in ("GET", "POST", "POST_FORM"):
            raise SPARQLConnectorException(
                'Method must be "GET", "POST", or "POST_FORM"'
            )

        self._method = method

    def query(
        self,
        query: str,
        default_graph: Optional[str] = None,
        named_graph: Optional[str] = None,
    ) -> Result:
        """Send ``query`` to the query endpoint and parse the response.

        :param query: the SPARQL query text.
        :param default_graph: sent as ``default-graph-uri`` unless it is
            ``None`` or a ``BNode``.
        :param named_graph: accepted but currently not sent to the endpoint.
        :returns: the parsed :class:`~rdflib.query.Result`. For the ``POST``
            and ``POST_FORM`` methods an ``HTTPError`` is not raised; a
            ``(status code, message, None)`` tuple is returned instead.
        :raises SPARQLConnectorException: if no query endpoint is set.
        :raises ValueError: if a ``GET`` request fails for any reason; the
            original exception is attached as ``__cause__``.
        """
        if not self.query_endpoint:
            raise SPARQLConnectorException("Query endpoint not set!")

        params = {}
        # this test ensures we don't have a useless (BNode) default graph URI, which calls to Graph().query() will add
        if default_graph is not None and not isinstance(default_graph, BNode):
            params["default-graph-uri"] = default_graph

        headers = {"Accept": _response_mime_types[self.returnFormat]}

        # Work on a deep copy so per-request changes never leak into self.kwargs.
        args = copy.deepcopy(self.kwargs)

        # merge params/headers dicts
        args.setdefault("params", {})
        args.setdefault("headers", {})
        args["headers"].update(headers)

        if self.method == "GET":
            params["query"] = query
            args["params"].update(params)
            qsa = "?" + urlencode(args["params"])
            try:
                res = urlopen(
                    Request(self.query_endpoint + qsa, headers=args["headers"])
                )
            except Exception as e:
                raise ValueError(
                    "You did something wrong formulating either the URI or your SPARQL query"
                ) from e
        elif self.method == "POST":
            args["headers"].update({"Content-Type": "application/sparql-query"})
            args["params"].update(params)
            qsa = "?" + urlencode(args["params"])
            try:
                res = urlopen(
                    Request(
                        self.query_endpoint + qsa,
                        data=query.encode(),
                        headers=args["headers"],
                    )
                )
            except HTTPError as e:
                # type error: Incompatible return value type (got "Tuple[int, str, None]", expected "Result")
                return e.code, str(e), None  # type: ignore[return-value]
        elif self.method == "POST_FORM":
            params["query"] = query
            args["params"].update(params)
            try:
                res = urlopen(
                    Request(
                        self.query_endpoint,
                        data=urlencode(args["params"]).encode(),
                        headers=args["headers"],
                    )
                )
            except HTTPError as e:
                # type error: Incompatible return value type (got "Tuple[int, str, None]", expected "Result")
                return e.code, str(e), None  # type: ignore[return-value]
        else:
            raise SPARQLConnectorException("Unknown method %s" % self.method)

        # The body is read fully into memory, so the HTTP response can be
        # closed as soon as parsing has been handed the buffered copy.
        with res:
            return Result.parse(
                BytesIO(res.read()),
                content_type=res.headers["Content-Type"].split(";")[0],
            )

    def update(
        self,
        query: str,
        default_graph: Optional[str] = None,
        named_graph: Optional[str] = None,
    ) -> None:
        """POST a SPARQL Update request to the update endpoint.

        :param query: the SPARQL Update text, sent as the request body.
        :param default_graph: sent as ``using-graph-uri`` when not ``None``.
        :param named_graph: sent as ``using-named-graph-uri`` when not ``None``.
        :raises SPARQLConnectorException: if no update endpoint is set.
        """
        if not self.update_endpoint:
            raise SPARQLConnectorException("Update endpoint not set!")

        params = {}

        if default_graph is not None:
            params["using-graph-uri"] = default_graph

        if named_graph is not None:
            params["using-named-graph-uri"] = named_graph

        headers = {
            "Accept": _response_mime_types[self.returnFormat],
            "Content-Type": "application/sparql-update; charset=UTF-8",
        }

        args = copy.deepcopy(self.kwargs)  # other QSAs

        args.setdefault("params", {})
        args["params"].update(params)
        args.setdefault("headers", {})
        args["headers"].update(headers)

        qsa = "?" + urlencode(args["params"])
        request = Request(
            self.update_endpoint + qsa, data=query.encode(), headers=args["headers"]
        )
        # The response body is not used; the ``with`` block only makes sure
        # the connection is closed.
        with urlopen(request):
            pass
__all__ = ["SPARQLConnector", "SPARQLConnectorException"]
@@ -0,0 +1,999 @@
"""
This is an RDFLib store around Ivan Herman et al.'s SPARQL service wrapper.
This was first done in layer-cake, and then ported to RDFLib
"""
from __future__ import annotations
import collections
import re
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
Iterable,
Iterator,
List,
Mapping,
Optional,
Tuple,
Union,
overload,
)
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, Graph
from rdflib.plugins.stores.regexmatching import NATIVE_REGEX
from rdflib.store import Store
from rdflib.term import BNode, Identifier, Node, URIRef, Variable
if TYPE_CHECKING:
import typing_extensions as te # noqa: I001
from rdflib.graph import (
_TripleType,
_ContextType,
_QuadType,
_TriplePatternType,
_SubjectType,
_PredicateType,
_ObjectType,
_ContextIdentifierType,
)
from rdflib.plugins.sparql.sparql import Query, Update
from rdflib.query import Result, ResultRow
from .sparqlconnector import SPARQLConnector
# Defines some SPARQL keywords
# These strings double as attribute names: SPARQLStore.triples() looks them
# up on the ``context`` argument with hasattr()/getattr() and appends the
# corresponding clause to the generated query. Note that ORDERBY contains a
# space, so it can only be set on an object with setattr().
LIMIT = "LIMIT"
OFFSET = "OFFSET"
ORDERBY = "ORDER BY"
# Matches "_:" followed by one or more non-whitespace characters and
# captures the whole match in the named group "label".
BNODE_IDENT_PATTERN = re.compile(r"(?P<label>_\:[^\s]+)")
# Signature of the ``node_to_sparql`` callables accepted by SPARQLStore:
# takes a Node and returns its SPARQL text.
_NodeToSparql = Callable[["Node"], str]
def _node_to_sparql(node: Node) -> str:
    """Serialize ``node`` with ``node.n3()``; blank nodes are rejected.

    :raises Exception: if ``node`` is a ``BNode``.
    """
    if not isinstance(node, BNode):
        return node.n3()
    raise Exception(
        "SPARQLStore does not support BNodes! "
        "See http://www.w3.org/TR/sparql11-query/#BGPsparqlBNodes"
    )
class SPARQLStore(SPARQLConnector, Store):
    """An RDFLib store around a SPARQL endpoint

    This is context-aware and should work as expected
    when a context is specified.

    For ConjunctiveGraphs, reading is done from the "default graph". Exactly
    what this means depends on your endpoint, because SPARQL does not offer a
    simple way to query the union of all graphs as it would be expected for a
    ConjunctiveGraph. This is why we recommend using Dataset instead, which is
    motivated by the SPARQL 1.1.

    Fuseki/TDB has a flag for specifying that the default graph
    is the union of all graphs (``tdb:unionDefaultGraph`` in the Fuseki config).

    .. warning:: By default the SPARQL Store does not support blank-nodes!

        As blank-nodes act as variables in SPARQL queries,
        there is no way to query for a particular blank node without
        using non-standard SPARQL extensions.

        See http://www.w3.org/TR/sparql11-query/#BGPsparqlBNodes

    You can make use of such extensions through the ``node_to_sparql``
    argument. For example if you want to transform BNode('0001') into
    "<bnode:b0001>", you can use a function like this:

    >>> def my_bnode_ext(node):
    ...    if isinstance(node, BNode):
    ...        return '<bnode:b%s>' % node
    ...    return _node_to_sparql(node)
    >>> store = SPARQLStore('http://dbpedia.org/sparql',
    ...                     node_to_sparql=my_bnode_ext)

    You can request a particular result serialization with the
    ``returnFormat`` parameter. This is a string that must have a
    matching plugin registered. Built in is support for ``xml``,
    ``json``, ``csv``, ``tsv`` and ``application/rdf+xml``.

    The underlying SPARQLConnector uses the urllib library.
    Any extra kwargs passed to the SPARQLStore connector are passed to
    urllib when doing HTTP calls. I.e. you have full control of
    cookies/auth/headers.

    For example:

    >>> store = SPARQLStore('...my endpoint ...', auth=('user','pass'))

    will use HTTP basic auth.

    """

    formula_aware = False
    transaction_aware = False
    graph_aware = True
    regex_matching = NATIVE_REGEX

    def __init__(
        self,
        query_endpoint: Optional[str] = None,
        sparql11: bool = True,
        context_aware: bool = True,
        node_to_sparql: _NodeToSparql = _node_to_sparql,
        returnFormat: str = "xml",  # noqa: N803
        auth: Optional[Tuple[str, str]] = None,
        **sparqlconnector_kwargs,
    ):
        """
        :param query_endpoint: URL of the SPARQL query endpoint.
        :param sparql11: whether the endpoint understands SPARQL 1.1
            (needed for ``initBindings`` and ``len()``).
        :param context_aware: stored as both ``context_aware`` and
            ``graph_aware``.
        :param node_to_sparql: callable turning a node into SPARQL text.
        :param returnFormat: forwarded to :class:`SPARQLConnector`.
        :param auth: forwarded to :class:`SPARQLConnector`.
        :param sparqlconnector_kwargs: forwarded to :class:`SPARQLConnector`.
        """
        super(SPARQLStore, self).__init__(
            query_endpoint=query_endpoint,
            returnFormat=returnFormat,
            auth=auth,
            **sparqlconnector_kwargs,
        )

        self.node_to_sparql = node_to_sparql
        self.nsBindings: Dict[str, Any] = {}
        self.sparql11 = sparql11
        self.context_aware = context_aware
        self.graph_aware = context_aware
        # Number of queries sent through _query().
        self._queries = 0

    # type error: Missing return statement
    def open(self, configuration: str, create: bool = False) -> Optional[int]:  # type: ignore[return]
        """This method is included so that calls to this Store via Graph, e.g. Graph("SPARQLStore"),
        can set the required parameters

        :param configuration: the query endpoint URI; any ``str`` (including
            ``str`` subclasses) is accepted.
        :raises Exception: if ``configuration`` is not a string.
        """
        if not isinstance(configuration, str):
            raise Exception(
                "configuration must be a string (a single query endpoint URI)"
            )
        self.query_endpoint = configuration

    # Database Management Methods
    def create(self, configuration: str) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError(
            "The SPARQL Store is read only. Try SPARQLUpdateStore for read/write."
        )

    def destroy(self, configuration: str) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    # Transactional interfaces
    def commit(self) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    def rollback(self) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    def add(
        self, _: _TripleType, context: _ContextType = None, quoted: bool = False
    ) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    def addN(self, quads: Iterable[_QuadType]) -> None:  # noqa: N802
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    # type error: Signature of "remove" incompatible with supertype "Store"
    def remove(  # type: ignore[override]
        self, _: _TriplePatternType, context: Optional[_ContextType]
    ) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    # type error: Signature of "update" incompatible with supertype "SPARQLConnector"
    def update(  # type: ignore[override]
        self,
        query: Union[Update, str],
        initNs: Optional[Dict[str, Any]] = None,  # noqa: N803
        initBindings: Optional[Dict[str, Identifier]] = None,  # noqa: N803
        queryGraph: Identifier = None,  # noqa: N803
        DEBUG: bool = False,  # noqa: N803
    ) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    def _query(self, *args: Any, **kwargs: Any) -> Result:
        """Count the request, then delegate to ``SPARQLConnector.query``."""
        self._queries += 1

        return super(SPARQLStore, self).query(*args, **kwargs)

    def _inject_prefixes(self, query: str, extra_bindings: Mapping[str, Any]) -> str:
        """Return ``query`` preceded by one ``PREFIX`` line per known binding.

        The store's own bindings come first, in insertion order, followed by
        ``extra_bindings``. When both define the same prefix, the entry from
        ``extra_bindings`` wins, so each prefix is declared exactly once and
        the output is deterministic.
        """
        bindings = {**self.nsBindings, **extra_bindings}
        if not bindings:
            return query
        prefix_lines = "\n".join(
            "PREFIX %s: <%s>" % (k, v) for k, v in bindings.items()
        )
        # separate ns_bindings from query with an empty line
        return "\n".join([prefix_lines, "", query])

    # type error: Signature of "query" incompatible with supertype "SPARQLConnector"
    # type error: Signature of "query" incompatible with supertype "Store"
    def query(  # type: ignore[override]
        self,
        query: Union[Query, str],
        initNs: Optional[Mapping[str, Any]] = None,  # noqa: N803
        initBindings: Optional[Mapping[str, Identifier]] = None,  # noqa: N803
        queryGraph: Optional[str] = None,  # noqa: N803
        DEBUG: bool = False,  # noqa: N803
    ) -> Result:
        """Run ``query`` (a string) against the endpoint.

        :param initNs: if non-empty, ``PREFIX`` declarations for these and
            for the store's own bindings are prepended.
        :param initBindings: appended to the query as a ``VALUES`` clause;
            only supported when ``sparql11`` is set.
        :param queryGraph: used as the default graph when
            :meth:`_is_contextual` says it is contextual.
        :param DEBUG: stored on ``self.debug``.
        :raises Exception: if ``initBindings`` is given for a SPARQL 1.0 endpoint.
        """
        self.debug = DEBUG
        assert isinstance(query, str)

        if initNs:
            query = self._inject_prefixes(query, initNs)

        if initBindings:
            if not self.sparql11:
                raise Exception("initBindings not supported for SPARQL 1.0 Endpoints.")
            v = list(initBindings)

            # VALUES was added to SPARQL 1.1 on 2012/07/24
            query += "\nVALUES ( %s )\n{ ( %s ) }\n" % (
                " ".join("?" + str(x) for x in v),
                " ".join(self.node_to_sparql(initBindings[x]) for x in v),
            )

        return self._query(
            query, default_graph=queryGraph if self._is_contextual(queryGraph) else None
        )

    # type error: Return type "Iterator[Tuple[Tuple[Node, Node, Node], None]]" of "triples" incompatible with return type "Iterator[Tuple[Tuple[Node, Node, Node], Iterator[Optional[Graph]]]]"
    def triples(  # type: ignore[override]
        self, spo: _TriplePatternType, context: Optional[_ContextType] = None
    ) -> Iterator[Tuple[_TripleType, None]]:
        """
        Generate ``((s, p, o), None)`` pairs matching the pattern ``spo``.

        - tuple **(s, p, o)**
            the triple used as filter for the SPARQL select.
            (None, None, None) means anything. Only ``None`` is treated as a
            wildcard, so falsy terms such as ``Literal(0)`` are matched
            literally.
        - context **context**
            the graph effectively calling this method.

        The query sent is essentially
        SELECT ?subj ?pred ?obj WHERE { ?subj ?pred ?obj }, or an ASK query
        when no term is a wildcard.

        **context** may carry three attributes to refine the underlying query:

        * LIMIT: an integer to limit the number of results
        * OFFSET: an integer to enable paging of results
        * ORDERBY: an instance of Variable('s'), Variable('o') or Variable('p') or, by default, the first 'None' from the given triple

        .. warning::

            - Using LIMIT or OFFSET automatically adds an ORDER BY clause,
              because otherwise the results are retrieved in a
              non-deterministic order (it depends on the walking path on
              the graph).
            - Using OFFSET without LIMIT skips the leading results and
              returns all the remaining ones.

        .. code-block:: python

            a_graph.LIMIT = limit
            a_graph.OFFSET = offset
            triple_generator = a_graph.triples(mytriple)
            # do something
            # Removes LIMIT and OFFSET if not required for the next triple() calls
            del a_graph.LIMIT
            del a_graph.OFFSET

        :raises ValueError: if the endpoint answered with an HTTP error.
        """
        s, p, o = spo

        variables = []
        if s is None:
            s = Variable("s")
            variables.append(s)

        if p is None:
            p = Variable("p")
            variables.append(p)
        if o is None:
            o = Variable("o")
            variables.append(o)

        if variables:
            v = " ".join([term.n3() for term in variables])
            verb = "SELECT %s " % v
        else:
            verb = "ASK"

        nts = self.node_to_sparql
        query = "%s { %s %s %s }" % (verb, nts(s), nts(p), nts(o))

        # The ORDER BY is necessary
        if (
            hasattr(context, LIMIT)
            or hasattr(context, OFFSET)
            or hasattr(context, ORDERBY)
        ):
            # An explicit Variable set on the context wins; otherwise order
            # by the first variable of the pattern.
            order_var = getattr(context, ORDERBY, None)
            if not isinstance(order_var, Variable):
                order_var = next(
                    (term for term in (s, p, o) if isinstance(term, Variable)), None
                )
            # Nothing to order by when every term is bound and no explicit
            # ORDERBY variable was given.
            if order_var is not None:
                query = query + " %s %s" % (ORDERBY, order_var.n3())

        # A missing or non-integer LIMIT / OFFSET is deliberately ignored.
        try:
            query = query + " LIMIT %s" % int(getattr(context, LIMIT))
        except (ValueError, TypeError, AttributeError):
            pass
        try:
            query = query + " OFFSET %s" % int(getattr(context, OFFSET))
        except (ValueError, TypeError, AttributeError):
            pass

        result = self._query(
            query,
            # type error: Item "None" of "Optional[Graph]" has no attribute "identifier"
            default_graph=context.identifier if self._is_contextual(context) else None,  # type: ignore[union-attr]
        )

        if type(result) is tuple:
            # SPARQLConnector.query returns (status code, message, None)
            # instead of a Result when a POST request hits an HTTPError.
            if result[0] == 401:
                raise ValueError(
                    "It looks like you need to authenticate with this SPARQL Store. HTTP unauthorized"
                )
            raise ValueError("SPARQL endpoint request failed: %s" % (result[1],))

        if variables:
            for row in result:
                if TYPE_CHECKING:
                    # This will be a ResultRow because if variables is truthish then
                    # the query will be a SELECT query.
                    assert isinstance(row, ResultRow)
                yield (
                    # type error: No overload variant of "get" of "ResultRow" matches argument types "Node", "Node"
                    row.get(s, s),  # type: ignore[call-overload]
                    row.get(p, p),  # type: ignore[call-overload]
                    row.get(o, o),  # type: ignore[call-overload]
                ), None  # why is the context here not the passed in graph 'context'?
        else:
            if result.askAnswer:
                yield (s, p, o), None

    def triples_choices(
        self,
        _: Tuple[
            Union[_SubjectType, List[_SubjectType]],
            Union[_PredicateType, List[_PredicateType]],
            Union[_ObjectType, List[_ObjectType]],
        ],
        context: Optional[_ContextType] = None,
    ) -> Generator[
        Tuple[
            Tuple[_SubjectType, _PredicateType, _ObjectType],
            Iterator[Optional[_ContextType]],
        ],
        None,
        None,
    ]:
        """
        A variant of triples that can take a list of terms instead of a
        single term in any slot.  Stores can implement this to optimize
        the response time from the import default 'fallback' implementation,
        which will iterate over each term in the list and dispatch to
        triples.

        Not supported by this store: always raises ``NotImplementedError``.
        """
        raise NotImplementedError("Triples choices currently not supported")

    def __len__(self, context: Optional[_ContextType] = None) -> int:
        """Return the triple count obtained with a ``count(*)`` query.

        :raises NotImplementedError: for SPARQL 1.0 endpoints.
        """
        if not self.sparql11:
            raise NotImplementedError(
                "For performance reasons, this is not "
                "supported for sparql1.0 endpoints"
            )

        q = "SELECT (count(*) as ?c) WHERE {?s ?p ?o .}"

        result = self._query(
            q,
            # type error: Item "None" of "Optional[Graph]" has no attribute "identifier"
            default_graph=(
                context.identifier  # type: ignore[union-attr]
                if self._is_contextual(context)
                else None
            ),
        )
        # type error: Item "Tuple[Node, ...]" of "Union[Tuple[Node, Node, Node], bool, ResultRow]" has no attribute "c"
        return int(next(iter(result)).c)  # type: ignore[union-attr]

    # type error: Return type "Generator[Identifier, None, None]" of "contexts" incompatible with return type "Generator[Graph, None, None]" in supertype "Store"
    def contexts(  # type: ignore[override]
        self, triple: Optional[_TripleType] = None
    ) -> Generator[_ContextIdentifierType, None, None]:
        """
        Iterates over results to "SELECT ?NAME { GRAPH ?NAME { ?s ?p ?o } }"
        or "SELECT ?NAME { GRAPH ?NAME {} }" if triple is `None`.

        Returns a generator over the ``?name`` binding of each result row.
        ``None`` terms inside ``triple`` are treated as wildcards.

        Please note that some SPARQL endpoints are not able to find empty named
        graphs.
        """

        if triple:
            nts = self.node_to_sparql
            s, p, o = triple
            params = (
                nts(Variable("s") if s is None else s),
                nts(Variable("p") if p is None else p),
                nts(Variable("o") if o is None else o),
            )
            q = "SELECT ?name WHERE { GRAPH ?name { %s %s %s }}" % params
        else:
            q = "SELECT ?name WHERE { GRAPH ?name {} }"

        result = self._query(q)
        # type error: Item "bool" of "Union[Tuple[Node, Node, Node], bool, ResultRow]" has no attribute "name"
        # error: Generator has incompatible item type "Union[Any, Identifier]"; expected "IdentifiedNode"
        return (row.name for row in result)  # type: ignore[union-attr,misc]

    # Namespace persistence interface implementation
    def bind(self, prefix: str, namespace: URIRef, override: bool = True) -> None:
        """Bind ``prefix`` to ``namespace`` in the store's local bindings.

        With ``override`` set, a prefix already bound to ``namespace`` is
        dropped first.
        """
        bound_prefix = self.prefix(namespace)
        if override and bound_prefix:
            del self.nsBindings[bound_prefix]
        self.nsBindings[prefix] = namespace

    def prefix(self, namespace: URIRef) -> Optional[str]:
        """Return the prefix bound to ``namespace``, or ``None``.

        If several prefixes map to the namespace, the most recently
        inserted one is returned.
        """
        return {v: k for k, v in self.nsBindings.items()}.get(namespace)

    def namespace(self, prefix: str) -> Optional[URIRef]:
        """Return the namespace bound to ``prefix``, or ``None``."""
        return self.nsBindings.get(prefix)

    def namespaces(self) -> Iterator[Tuple[str, URIRef]]:
        """Yield every locally bound ``(prefix, namespace)`` pair."""
        for prefix, ns in self.nsBindings.items():
            yield prefix, ns

    def add_graph(self, graph: Graph) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    def remove_graph(self, graph: Graph) -> None:
        """Always raises ``TypeError``: this store is read only."""
        raise TypeError("The SPARQL store is read only")

    @overload
    def _is_contextual(self, graph: None) -> te.Literal[False]: ...

    @overload
    def _is_contextual(self, graph: Optional[Union[Graph, str]]) -> bool: ...

    def _is_contextual(self, graph: Optional[Union[Graph, str]]) -> bool:
        """Returns `True` if the "GRAPH" keyword must appear
        in the final SPARQL query sent to the endpoint.

        That is the case when the store is context-aware and ``graph`` is
        neither ``None``, the string ``"__UNION__"``, nor a graph whose
        identifier is ``DATASET_DEFAULT_GRAPH_ID``.
        """
        if (not self.context_aware) or (graph is None):
            return False
        if isinstance(graph, str):
            return graph != "__UNION__"
        else:
            return graph.identifier != DATASET_DEFAULT_GRAPH_ID

    def subjects(
        self,
        predicate: Optional[_PredicateType] = None,
        object: Optional[_ObjectType] = None,
    ) -> Generator[_SubjectType, None, None]:
        """A generator of subjects with the given predicate and object"""
        for t, _ in self.triples((None, predicate, object)):
            yield t[0]

    def predicates(
        self,
        subject: Optional[_SubjectType] = None,
        object: Optional[_ObjectType] = None,
    ) -> Generator[_PredicateType, None, None]:
        """A generator of predicates with the given subject and object"""
        for t, _ in self.triples((subject, None, object)):
            yield t[1]

    def objects(
        self,
        subject: Optional[_SubjectType] = None,
        predicate: Optional[_PredicateType] = None,
    ) -> Generator[_ObjectType, None, None]:
        """A generator of objects with the given subject and predicate"""
        for t, _ in self.triples((subject, predicate, None)):
            yield t[2]

    def subject_predicates(
        self, object: Optional[_ObjectType] = None
    ) -> Generator[Tuple[_SubjectType, _PredicateType], None, None]:
        """A generator of (subject, predicate) tuples for the given object"""
        for t, _ in self.triples((None, None, object)):
            yield t[0], t[1]

    def subject_objects(
        self, predicate: Optional[_PredicateType] = None
    ) -> Generator[Tuple[_SubjectType, _ObjectType], None, None]:
        """A generator of (subject, object) tuples for the given predicate"""
        for t, _ in self.triples((None, predicate, None)):
            yield t[0], t[2]

    def predicate_objects(
        self, subject: Optional[_SubjectType] = None
    ) -> Generator[Tuple[_PredicateType, _ObjectType], None, None]:
        """A generator of (predicate, object) tuples for the given subject"""
        for t, _ in self.triples((subject, None, None)):
            yield t[1], t[2]
class SPARQLUpdateStore(SPARQLStore):
"""A store using SPARQL queries for reading and SPARQL Update for changes.
This can be context-aware, if so, any changes will be to the given named
graph only.
In favor of the SPARQL 1.1 motivated Dataset, we advise against using this
with ConjunctiveGraphs, as it reads and writes from and to the
"default graph". Exactly what this means depends on the endpoint and can
result in confusion.
For Graph objects, everything works as expected.
See the :class:`SPARQLStore` base class for more information.
"""
where_pattern = re.compile(r"""(?P<where>WHERE\s*\{)""", re.IGNORECASE)
##############################################################
# Regex for injecting GRAPH blocks into updates on a context #
##############################################################
# Observations on the SPARQL grammar (http://www.w3.org/TR/2013/REC-sparql11-query-20130321/):
# 1. Only the terminals STRING_LITERAL1, STRING_LITERAL2,
# STRING_LITERAL_LONG1, STRING_LITERAL_LONG2, and comments can contain
# curly braces.
# 2. The non-terminals introduce curly braces in pairs only.
# 3. Unescaped " can occur only in strings and comments.
# 4. Unescaped ' can occur only in strings, comments, and IRIRefs.
# 5. \ always escapes the following character, especially \", \', and
#    \\ denote literal ", ', and \ respectively.
# 6. # always starts a comment outside of string and IRI
# 7. A comment ends at the next newline
# 8. IRIREFs need to be detected, as they may contain # without starting a comment
# 9. PrefixedNames do not contain a #
# As a consequence, it should be rather easy to detect strings and comments
# in order to avoid unbalanced curly braces.
# From the SPARQL grammar
STRING_LITERAL1 = "'([^'\\\\]|\\\\.)*'"
STRING_LITERAL2 = '"([^"\\\\]|\\\\.)*"'
STRING_LITERAL_LONG1 = "'''(('|'')?([^'\\\\]|\\\\.))*'''"
STRING_LITERAL_LONG2 = '"""(("|"")?([^"\\\\]|\\\\.))*"""'
String = "(%s)|(%s)|(%s)|(%s)" % (
STRING_LITERAL1,
STRING_LITERAL2,
STRING_LITERAL_LONG1,
STRING_LITERAL_LONG2,
)
IRIREF = '<([^<>"{}|^`\\]\\\\[\\x00-\\x20])*>'
COMMENT = "#[^\\x0D\\x0A]*([\\x0D\\x0A]|\\Z)"
# Simplified grammar to find { at beginning and } at end of blocks
BLOCK_START = "{"
BLOCK_END = "}"
ESCAPED = "\\\\."
# Match anything that doesn't start or end a block:
BlockContent = "(%s)|(%s)|(%s)|(%s)" % (String, IRIREF, COMMENT, ESCAPED)
BlockFinding = "(?P<block_start>%s)|(?P<block_end>%s)|(?P<block_content>%s)" % (
BLOCK_START,
BLOCK_END,
BlockContent,
)
BLOCK_FINDING_PATTERN = re.compile(BlockFinding)
# Note that BLOCK_FINDING_PATTERN.finditer() will not cover the whole
# string with matches. Everything that is not matched will have to be
# part of the modified query as is.
##################################################################
def __init__(
    self,
    query_endpoint: Optional[str] = None,
    update_endpoint: Optional[str] = None,
    sparql11: bool = True,
    context_aware: bool = True,
    postAsEncoded: bool = True,  # noqa: N803
    autocommit: bool = True,
    dirty_reads: bool = False,
    **kwds,
):
    """
    Set up the read side through ``SPARQLStore.__init__`` and initialise
    the write-side bookkeeping.

    :param autocommit if set, the store will commit after every
        writing operations. If False, we only make queries on the
        server once commit is called.
    :param dirty_reads if set, we do not commit before reading. So you
        cannot read what you wrote before manually calling commit.
    """
    SPARQLStore.__init__(
        self,
        query_endpoint,
        sparql11,
        context_aware,
        update_endpoint=update_endpoint,
        **kwds,
    )

    # Write-side state: pending update strings (None when nothing is
    # queued) and the number of update requests issued so far.
    self._edits: Optional[List[str]] = None
    self._updates = 0

    # Behaviour switches, stored as given.
    self.dirty_reads = dirty_reads
    self.autocommit = autocommit
    self.postAsEncoded = postAsEncoded
def query(self, *args: Any, **kwargs: Any) -> Result:
    """Run ``SPARQLStore.query``, committing pending edits first unless
    ``autocommit`` or ``dirty_reads`` is set."""
    flush_first = not (self.autocommit or self.dirty_reads)
    if flush_first:
        self.commit()
    return SPARQLStore.query(self, *args, **kwargs)
# type error: Signature of "triples" incompatible with supertype "Store"
def triples(  # type: ignore[override]
    self, *args: Any, **kwargs: Any
) -> Iterator[Tuple[_TripleType, None]]:
    """Return ``SPARQLStore.triples(...)``, committing pending edits first
    unless ``autocommit`` or ``dirty_reads`` is set.

    The commit happens when this method is called, not when the returned
    iterator is first consumed.
    """
    flush_first = not (self.autocommit or self.dirty_reads)
    if flush_first:
        self.commit()
    return SPARQLStore.triples(self, *args, **kwargs)
# type error: Signature of "contexts" incompatible with supertype "Store"
def contexts(  # type: ignore[override]
    self, *args: Any, **kwargs: Any
) -> Generator[_ContextIdentifierType, None, None]:
    """Return ``SPARQLStore.contexts(...)``, committing pending edits first
    unless ``autocommit`` or ``dirty_reads`` is set."""
    flush_first = not (self.autocommit or self.dirty_reads)
    if flush_first:
        self.commit()
    return SPARQLStore.contexts(self, *args, **kwargs)
def __len__(self, *args: Any, **kwargs: Any) -> int:
    """Return ``SPARQLStore.__len__(...)``, committing pending edits first
    unless ``autocommit`` or ``dirty_reads`` is set."""
    flush_first = not (self.autocommit or self.dirty_reads)
    if flush_first:
        self.commit()
    return SPARQLStore.__len__(self, *args, **kwargs)
def open(
    self, configuration: Union[str, Tuple[str, str]], create: bool = False
) -> None:
    """
    sets the endpoint URLs for this SPARQLStore

    :param configuration: either a tuple of (query_endpoint, update_endpoint),
        or a string with the endpoint which is configured as query and update endpoint
    :param create: if True an exception is thrown.
    """
    if create:
        raise Exception("Cannot create a SPARQL Endpoint")

    if not isinstance(configuration, tuple):
        # A single value serves as both endpoints.
        self.query_endpoint = configuration
        self.update_endpoint = configuration
        return

    self.query_endpoint = configuration[0]
    # A one-element tuple leaves the update endpoint untouched.
    if len(configuration) > 1:
        self.update_endpoint = configuration[1]
def _transaction(self) -> List[str]:
    """Return the list of pending update strings, creating it on first use."""
    pending = self._edits
    if pending is None:
        pending = self._edits = []
    return pending
# Transactional interfaces
def commit(self) -> None:
    """add(), addN(), and remove() are transactional to reduce overhead of many small edits.
    Read and update() calls will automatically commit any outstanding edits.
    This should behave as expected most of the time, except that alternating writes
    and reads can degenerate to the original call-per-triple situation that originally existed.

    Pending edits are joined with ``"\\n;\\n"`` and sent as one update; the
    queue is cleared afterwards. Nothing is sent when the queue is empty.
    """
    pending = self._edits
    if not pending:
        return
    self._update("\n;\n".join(pending))
    self._edits = None
def rollback(self) -> None:
    """Discard every edit queued since the last commit."""
    # ``None`` (not an empty list) is the "nothing pending" marker.
    self._edits = None
def add(
    self,
    spo: _TripleType,
    context: Optional[_ContextType] = None,
    quoted: bool = False,
) -> None:
    """Add a triple to the store of triples.

    The triple is queued as an ``INSERT DATA`` operation, wrapped in a
    ``GRAPH`` block when ``context`` is contextual, and committed right
    away when ``autocommit`` is set.

    :raises Exception: if no update endpoint is configured.
    """
    if not self.update_endpoint:
        raise Exception("UpdateEndpoint is not set")

    assert not quoted
    to_sparql = self.node_to_sparql
    subject, predicate, obj = spo
    statement = "%s %s %s ." % (
        to_sparql(subject),
        to_sparql(predicate),
        to_sparql(obj),
    )

    if not self._is_contextual(context):
        operation = "INSERT DATA { %s }" % statement
    else:
        if TYPE_CHECKING:
            # _is_contextual will never return true if context is None
            assert context is not None
        operation = "INSERT DATA { GRAPH %s { %s } }" % (
            to_sparql(context.identifier),
            statement,
        )

    self._transaction().append(operation)
    if self.autocommit:
        self.commit()
def addN(self, quads: Iterable[_QuadType]) -> None:  # noqa: N802
    """Add a list of quads to the store.

    Quads are grouped by context and one ``INSERT DATA`` operation is
    queued per context. As in :meth:`add`, the triples are wrapped in a
    ``GRAPH`` block only when the context is contextual; triples for a
    non-contextual context go into a plain ``INSERT DATA`` block.

    :raises Exception: if no update endpoint is configured.
    """
    if not self.update_endpoint:
        raise Exception("UpdateEndpoint is not set - call 'open'")

    # Group triples by context, keeping first-seen context order.
    contexts = collections.defaultdict(list)
    for subject, predicate, obj, context in quads:
        contexts[context].append((subject, predicate, obj))

    data: List[str] = []
    nts = self.node_to_sparql
    for context, context_triples in contexts.items():
        triples = "\n".join(
            "%s %s %s ." % (nts(subject), nts(predicate), nts(obj))
            for subject, predicate, obj in context_triples
        )
        if self._is_contextual(context):
            data.append(
                "INSERT DATA { GRAPH %s { %s } }\n"
                % (nts(context.identifier), triples)
            )
        else:
            data.append("INSERT DATA { %s }\n" % triples)

    self._transaction().extend(data)
    if self.autocommit:
        self.commit()
# type error: Signature of "remove" incompatible with supertype "Store"
def remove(  # type: ignore[override]
    self, spo: _TriplePatternType, context: Optional[_ContextType]
) -> None:
    """Remove a triple from the store

    ``None`` terms in ``spo`` act as wildcards. Only ``None`` is treated
    that way: falsy terms (for example a literal whose value is ``0`` or
    an empty string) are matched literally, so they cannot silently widen
    the DELETE to every value.

    :raises Exception: if no update endpoint is configured.
    """
    if not self.update_endpoint:
        raise Exception("UpdateEndpoint is not set - call 'open'")

    (subject, predicate, obj) = spo
    if subject is None:
        subject = Variable("S")
    if predicate is None:
        predicate = Variable("P")
    if obj is None:
        obj = Variable("O")

    nts = self.node_to_sparql
    triple = "%s %s %s ." % (nts(subject), nts(predicate), nts(obj))
    if self._is_contextual(context):
        if TYPE_CHECKING:
            # _is_contextual will never return true if context is None
            assert context is not None
        cid = nts(context.identifier)
        q = "WITH %(graph)s DELETE { %(triple)s } WHERE { %(triple)s }" % {
            "graph": cid,
            "triple": triple,
        }
    else:
        q = "DELETE { %s } WHERE { %s } " % (triple, triple)
    self._transaction().append(q)
    if self.autocommit:
        self.commit()
def setTimeout(self, timeout) -> None:  # noqa: N802
    """Store ``timeout``, converted with ``int()``, on ``self._timeout``."""
    seconds = int(timeout)
    self._timeout = seconds
def _update(self, update):
    """Count the request, then send ``update`` via ``SPARQLConnector.update``."""
    self._updates = self._updates + 1
    SPARQLConnector.update(self, update)
# type error: Signature of "update" incompatible with supertype "SPARQLConnector"
# type error: Signature of "update" incompatible with supertype "Store"
def update(  # type: ignore[override]
    self,
    query: Union[Update, str],
    initNs: Optional[Dict[str, Any]] = None,  # noqa: N803
    initBindings: Optional[Dict[str, Identifier]] = None,  # noqa: N803
    queryGraph: Optional[str] = None,  # noqa: N803
    DEBUG: bool = False,  # noqa: N803
):
    """
    Perform a SPARQL Update Query against the endpoint,
    INSERT, LOAD, DELETE etc.
    Setting initNs adds PREFIX declarations to the beginning of
    the update. Setting initBindings adds inline VALUEs to the
    beginning of every WHERE clause. By the SPARQL grammar, all
    operations that support variables (namely INSERT and DELETE)
    require a WHERE clause.
    Important: initBindings fails if the update contains the
    substring 'WHERE {' which does not denote a WHERE clause, e.g.
    if it is part of a literal.

    The update is queued and, when ``autocommit`` is set, committed
    immediately.

    .. admonition:: Context-aware query rewriting

        - **When:**  If context-awareness is enabled and the graph is not the default graph of the store.
        - **Why:** To ensure consistency with the :class:`~rdflib.plugins.stores.memory.Memory` store.
          The graph must accept "local" SPARQL requests (requests with no GRAPH keyword)
          as if it was the default graph.
        - **What is done:** These "local" queries are rewritten by this store.
          The content of each block of a SPARQL Update operation is wrapped in a GRAPH block
          except if the block is empty.
          This basically causes INSERT, INSERT DATA, DELETE, DELETE DATA and WHERE to operate
          only on the context.
        - **Example:** ``"INSERT DATA { <urn:michel> <urn:likes> <urn:pizza> }"`` is converted into
          ``"INSERT DATA { GRAPH <urn:graph> { <urn:michel> <urn:likes> <urn:pizza> } }"``.
        - **Warning:** Queries are presumed to be "local" but this assumption is **not checked**.
          For instance, if the query already contains GRAPH blocks, the latter will be wrapped in new GRAPH blocks.
        - **Warning:** A simplified grammar is used that should tolerate
          extensions of the SPARQL grammar. Still, the process may fail in
          uncommon situations and produce invalid output.

    :raises Exception: if no update endpoint is configured.
    """
    if not self.update_endpoint:
        raise Exception("Update endpoint is not set!")

    self.debug = DEBUG
    assert isinstance(query, str)
    query = self._inject_prefixes(query, initNs or {})

    if self._is_contextual(queryGraph):
        if TYPE_CHECKING:
            # _is_contextual will never return true if context is None
            assert queryGraph is not None
        query = self._insert_named_graph(query, queryGraph)

    if initBindings:
        # For INSERT and DELETE the WHERE clause is obligatory
        # (http://www.w3.org/TR/2013/REC-sparql11-query-20130321/#rModify)
        # Other query types do not allow variables and don't
        # have a WHERE clause.  This also works for updates with
        # more than one INSERT/DELETE.
        v = list(initBindings)
        values = "\nVALUES ( %s )\n{ ( %s ) }\n" % (
            " ".join("?" + str(x) for x in v),
            " ".join(self.node_to_sparql(initBindings[x]) for x in v),
        )

        # A callable replacement inserts ``values`` verbatim. A plain
        # replacement string would have its backslash escapes processed
        # by ``re.sub`` (e.g. ``\n`` inside a serialized literal would
        # become a real newline), corrupting the bound values.
        where_clause = "WHERE { " + values
        query = self.where_pattern.sub(lambda _match: where_clause, query)

    self._transaction().append(query)
    if self.autocommit:
        self.commit()
def _insert_named_graph(self, query: str, query_graph: str) -> str:
    """
    Wrap the contents of every top-level block of a SPARQL Update request
    in ``GRAPH <query_graph> { ... }``.

    For instance, "INSERT DATA { <urn:michel> <urn:likes> <urn:pizza> }"
    is converted into
    "INSERT DATA { GRAPH <urn:graph> { <urn:michel> <urn:likes> <urn:pizza> } }"

    Args:
        query: Text of the update request to rewrite.
        query_graph: Graph the blocks are scoped to. A ``Node`` is rendered
            with ``self.node_to_sparql``; any other value is placed between
            angle brackets.

    Returns:
        The rewritten request. A top-level block that holds nothing but
        whitespace is left without a GRAPH wrapper.
    """
    if isinstance(query_graph, Node):
        graph_term = self.node_to_sparql(query_graph)
    else:
        graph_term = "<%s>" % query_graph
    opener = " GRAPH %s {" % graph_term
    closer = "} "

    # Operations defined by SPARQL Update: LOAD, CLEAR, DROP, ADD, MOVE, COPY,
    # CREATE, INSERT DATA, DELETE DATA, DELETE/INSERT, DELETE WHERE.
    # - LOAD, CLEAR, DROP, ADD, MOVE, COPY, CREATE do not make much sense in
    #   a context.
    # - INSERT DATA, DELETE DATA and DELETE WHERE need the contents of their
    #   block wrapped in a GRAPH <?> { }.
    # - DELETE/INSERT supports WITH, which sets the graph for all following
    #   DELETE/INSERT instructions including the non-optional WHERE block;
    #   adding a GRAPH block to every block is equivalent.
    #
    # Strategy: wrap the contents of every top-level block in `GRAPH <?> { }`.
    depth = 0  # brace nesting level at the current scan position
    cursor = 0  # offset of the first character of `query` not yet emitted
    pieces = []
    for token in self.BLOCK_FINDING_PATTERN.finditer(query):
        if token.group("block_start") is not None:
            depth += 1
            if depth == 1:
                # Emit everything up to and including the opening brace,
                # then start the GRAPH wrapper.
                pieces.extend((query[cursor : token.end()], opener))
                cursor = token.end()
            continue

        if token.group("block_end") is None:
            # Neither a block start nor a block end: nothing to do.
            continue

        if depth == 1:
            inner = query[cursor : token.start()]
            # The identity test recognises the opener object appended above,
            # not merely an equal string.
            if pieces[-1] is opener and not inner.strip():
                # Wrapping here would produce an empty GRAPH block, which
                # some endpoints (e.g. TDB) can not handle. Replace the
                # opener that was just added with the original content.
                pieces[-1] = inner
            else:
                pieces.extend((inner, closer))
            cursor = token.start()
        depth -= 1

    pieces.append(query[cursor:])
    return "".join(pieces)
def add_graph(self, graph: Graph) -> None:
    """
    Register ``graph`` with the store.

    When ``self.graph_aware`` is false the call is delegated to
    ``Store.add_graph``. Otherwise a ``CREATE GRAPH`` update is sent through
    ``self.update`` unless the graph's identifier is
    ``DATASET_DEFAULT_GRAPH_ID``, in which case nothing is sent.
    """
    if not self.graph_aware:
        Store.add_graph(self, graph)
        return

    if graph.identifier != DATASET_DEFAULT_GRAPH_ID:
        create_statement = "CREATE GRAPH %s" % self.node_to_sparql(graph.identifier)
        self.update(create_statement)
def remove_graph(self, graph: Graph) -> None:
    """
    Remove ``graph`` from the store.

    When ``self.graph_aware`` is false the call is delegated to
    ``Store.remove_graph``. Otherwise a ``DROP`` update is sent through
    ``self.update``: ``DROP DEFAULT`` when the graph's identifier is
    ``DATASET_DEFAULT_GRAPH_ID``, ``DROP GRAPH <identifier>`` for any other
    graph.
    """
    if not self.graph_aware:
        Store.remove_graph(self, graph)
        return

    if graph.identifier == DATASET_DEFAULT_GRAPH_ID:
        drop_statement = "DROP DEFAULT"
    else:
        drop_statement = "DROP GRAPH %s" % self.node_to_sparql(graph.identifier)
    self.update(drop_statement)
def subjects(
    self,
    predicate: Optional[_PredicateType] = None,
    object: Optional[_ObjectType] = None,
) -> Generator[_SubjectType, None, None]:
    """
    Yield the subject of each triple returned by ``self.triples`` for the
    pattern ``(None, predicate, object)``.

    The context paired with each triple is ignored.
    """
    pattern = (None, predicate, object)
    yield from (triple[0] for triple, _context in self.triples(pattern))
def predicates(
    self,
    subject: Optional[_SubjectType] = None,
    object: Optional[_ObjectType] = None,
) -> Generator[_PredicateType, None, None]:
    """
    Yield the predicate of each triple returned by ``self.triples`` for the
    pattern ``(subject, None, object)``.

    The context paired with each triple is ignored.
    """
    pattern = (subject, None, object)
    yield from (triple[1] for triple, _context in self.triples(pattern))
def objects(
    self,
    subject: Optional[_SubjectType] = None,
    predicate: Optional[_PredicateType] = None,
) -> Generator[_ObjectType, None, None]:
    """
    Yield the object of each triple returned by ``self.triples`` for the
    pattern ``(subject, predicate, None)``.

    The context paired with each triple is ignored.
    """
    pattern = (subject, predicate, None)
    yield from (triple[2] for triple, _context in self.triples(pattern))
def subject_predicates(
    self, object: Optional[_ObjectType] = None
) -> Generator[Tuple[_SubjectType, _PredicateType], None, None]:
    """
    Yield a ``(subject, predicate)`` tuple for each triple returned by
    ``self.triples`` for the pattern ``(None, None, object)``.

    The context paired with each triple is ignored.
    """
    pattern = (None, None, object)
    yield from (
        (triple[0], triple[1]) for triple, _context in self.triples(pattern)
    )
def subject_objects(
    self, predicate: Optional[_PredicateType] = None
) -> Generator[Tuple[_SubjectType, _ObjectType], None, None]:
    """
    Yield a ``(subject, object)`` tuple for each triple returned by
    ``self.triples`` for the pattern ``(None, predicate, None)``.

    The context paired with each triple is ignored.
    """
    pattern = (None, predicate, None)
    yield from (
        (triple[0], triple[2]) for triple, _context in self.triples(pattern)
    )
def predicate_objects(
    self, subject: Optional[_SubjectType] = None
) -> Generator[Tuple[_PredicateType, _ObjectType], None, None]:
    """
    Yield a ``(predicate, object)`` tuple for each triple returned by
    ``self.triples`` for the pattern ``(subject, None, None)``.

    The context paired with each triple is ignored.
    """
    pattern = (subject, None, None)
    yield from (
        (triple[1], triple[2]) for triple, _context in self.triples(pattern)
    )
# Names exported by ``from <this module> import *``.
__all__ = ["SPARQLUpdateStore", "SPARQLStore"]