2025-12-01

This commit is contained in:
2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,19 @@
from rdflib.namespace import RDF # noqa: N999
from rdflib.term import URIRef
# Vocabulary of RDF/XML syntax terms, declared as a subclass of the imported
# ``RDF`` namespace so that each term below is typed as a ``URIRef``.
class RDFVOC(RDF):
    # NOTE(review): these two flags are presumably options read by the
    # namespace base class (numbered ``_n`` members / failing on unknown
    # terms) -- confirm against the base class, which is not shown here.
    _underscore_num = True
    _fail = True
    # http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
    # A mapping from unqualified terms to their qualified version.
    RDF: URIRef
    Description: URIRef
    ID: URIRef
    about: URIRef
    parseType: URIRef  # noqa: N815
    resource: URIRef
    li: URIRef
    nodeID: URIRef  # noqa: N815
    datatype: URIRef
@@ -0,0 +1,172 @@
"""
This is an rdflib plugin for parsing Hextuples files, which are Newline-Delimited
JSON (ndjson) files, into a ConjunctiveGraph or Dataset. The store that backs the
graph *must* be able to handle contexts, i.e. multiple graphs.
"""
from __future__ import annotations
import json
import warnings
from io import TextIOWrapper
from typing import TYPE_CHECKING, Any, BinaryIO, List, Optional, TextIO, Union
from rdflib.graph import ConjunctiveGraph, Dataset, Graph
from rdflib.parser import InputSource, Parser
from rdflib.term import BNode, Literal, URIRef
try:
import orjson
_HAS_ORJSON = True
except ImportError:
orjson = None # type: ignore[assignment, unused-ignore]
_HAS_ORJSON = False
if TYPE_CHECKING:
from io import BufferedReader
__all__ = ["HextuplesParser"]
class HextuplesParser(Parser):
    """
    An RDFLib parser for Hextuples

    Each non-blank input line is a JSON array of six strings:
    subject, predicate, value, datatype, language and graph.
    """

    def __init__(self):
        super().__init__()
        # Graph that receives statements whose graph component is empty.
        self.default_context: Optional[Graph] = None
        # When True, every blank node is replaced by its skolemized form.
        self.skolemize = False

    @staticmethod
    def _strip_bnode_prefix(label: str) -> str:
        """Return *label* without a leading ``_:``.

        Only the prefix is removed; a ``_:`` sequence occurring later in the
        label is part of the identifier and is kept.
        """
        return label[2:] if label.startswith("_:") else label

    def _parse_hextuple(
        self, ds: Union[Dataset, ConjunctiveGraph], tup: List[Union[str, None]]
    ) -> None:
        """Add the statement described by one hextuple to the right graph.

        :param ds: dataset used to look up the graph named by ``tup[5]``
        :param tup: ``[subject, predicate, value, datatype, language, graph]``
        :raises ValueError: if subject, predicate, value or datatype is None
        :raises Exception: if the hextuple names no graph and no default
            context has been set
        """
        # all values check
        # subject, predicate, value, datatype cannot be None
        # language and graph may be None
        if tup[0] is None or tup[1] is None or tup[2] is None or tup[3] is None:
            raise ValueError(
                f"subject, predicate, value, datatype cannot be None. Given: {tup}"
            )

        # 1 - subject
        s: Union[URIRef, BNode]
        # Blank nodes are recognised by the full "_:" prefix, the same test
        # that is applied to the graph component below.
        if tup[0].startswith("_:"):
            s = BNode(value=self._strip_bnode_prefix(tup[0]))
            if self.skolemize:
                s = s.skolemize()
        else:
            s = URIRef(tup[0])

        # 2 - predicate
        p = URIRef(tup[1])

        # 3 - value
        o: Union[URIRef, BNode, Literal]
        if tup[3] == "globalId":
            o = URIRef(tup[2])
        elif tup[3] == "localId":
            o = BNode(value=self._strip_bnode_prefix(tup[2]))
            if self.skolemize:
                o = o.skolemize()
        else:  # literal
            if tup[4] is None:
                o = Literal(tup[2], datatype=URIRef(tup[3]))
            else:
                o = Literal(tup[2], lang=tup[4])

        # 6 - context
        if tup[5] is not None:
            c = (
                BNode(self._strip_bnode_prefix(tup[5]))
                if tup[5].startswith("_:")
                else URIRef(tup[5])
            )
            if isinstance(c, BNode) and self.skolemize:
                c = c.skolemize()
            ds.get_context(c).add((s, p, o))
        elif self.default_context is not None:
            self.default_context.add((s, p, o))
        else:
            raise Exception("No context to parse into!")

    # type error: Signature of "parse" incompatible with supertype "Parser"
    def parse(self, source: InputSource, graph: Graph, skolemize: bool = False, **kwargs: Any) -> None:  # type: ignore[override]
        """Parse a Hextuples document from *source* into *graph*'s store.

        :param source: input providing a character stream and/or a byte stream
        :param graph: target graph; its store must be context aware
        :param skolemize: replace every parsed blank node by its skolemized form
        :param kwargs: only ``encoding`` is inspected; any value other than
            ``None`` or ``"utf-8"`` triggers a warning and is ignored
        :raises ValueError: if *source* offers neither kind of stream
        """
        if kwargs.get("encoding") not in [None, "utf-8"]:
            warnings.warn(
                f"Hextuples files are always utf-8 encoded, "
                f"I was passed: {kwargs.get('encoding')}, "
                "but I'm still going to use utf-8"
            )

        assert (
            graph.store.context_aware
        ), "Hextuples Parser needs a context-aware store!"

        self.skolemize = skolemize

        # Set default_union to True to mimic ConjunctiveGraph behavior
        ds = Dataset(store=graph.store, default_union=True)
        ds_default = ds.default_context  # the DEFAULT_DATASET_GRAPH_ID
        if isinstance(graph, (Dataset, ConjunctiveGraph)):
            self.default_context = graph.default_context
        elif graph.identifier is not None:
            if graph.identifier == ds_default.identifier:
                self.default_context = graph
            else:
                self.default_context = ds.get_context(graph.identifier)
        else:
            # mypy thinks this is unreachable, but graph.identifier can be None
            self.default_context = ds_default  # type: ignore[unreachable]
        if self.default_context is not ds_default:
            ds.default_context = self.default_context
            ds.remove_graph(ds_default)  # remove the original unused default graph

        try:
            text_stream: Optional[TextIO] = source.getCharacterStream()
        except (AttributeError, LookupError):
            text_stream = None
        try:
            binary_stream: Optional[BinaryIO] = source.getByteStream()
        except (AttributeError, LookupError):
            binary_stream = None

        if text_stream is None and binary_stream is None:
            raise ValueError(
                f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
            )
        if TYPE_CHECKING:
            assert text_stream is not None or binary_stream is not None

        use_stream: Union[TextIO, BinaryIO]
        if _HAS_ORJSON:
            # With orjson the byte stream is preferred over the text stream.
            if binary_stream is not None:
                use_stream = binary_stream
            else:
                if TYPE_CHECKING:
                    assert isinstance(text_stream, TextIOWrapper)
                use_stream = text_stream
            loads = orjson.loads
        else:
            # The stdlib json module is fed text; wrap the byte stream if
            # that is all the source provides.
            if text_stream is not None:
                use_stream = text_stream
            else:
                if TYPE_CHECKING:
                    assert isinstance(binary_stream, BufferedReader)
                use_stream = TextIOWrapper(binary_stream, encoding="utf-8")
            loads = json.loads

        for line in use_stream:  # type: Union[str, bytes]
            if len(line) == 0 or line.isspace():
                # Skipping empty lines because this is what was being done before for the first and last lines, albeit in a rather indirect way.
                # The result is that we accept input that would otherwise be invalid.
                # Possibly we should just let this result in an error.
                continue
            # this complex handling is because the 'value' component is
            # allowed to be "" but not None
            # all other "" values are treated as None
            raw_line: List[str] = loads(line)
            hex_tuple_line = [x if x != "" else None for x in raw_line]
            if raw_line[2] == "":
                hex_tuple_line[2] = ""
            self._parse_hextuple(ds, hex_tuple_line)
@@ -0,0 +1,712 @@
"""
This parser will interpret a JSON-LD document as an RDF Graph. See:
http://json-ld.org/
Example usage::
>>> from rdflib import Graph, URIRef, Literal
>>> test_json = '''
... {
... "@context": {
... "dc": "http://purl.org/dc/terms/",
... "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
... "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
... },
... "@id": "http://example.org/about",
... "dc:title": {
... "@language": "en",
... "@value": "Someone's Homepage"
... }
... }
... '''
>>> g = Graph().parse(data=test_json, format='json-ld')
>>> list(g) == [(URIRef('http://example.org/about'),
... URIRef('http://purl.org/dc/terms/title'),
... Literal("Someone's Homepage", lang='en'))]
True
"""
# From: https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/parser.py
# NOTE: This code reads the entire JSON object into memory before parsing, but
# we should consider streaming the input to deal with arbitrarily large graphs.
from __future__ import annotations
import secrets
import warnings
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
import rdflib.parser
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.namespace import RDF, XSD
from rdflib.parser import InputSource, URLInputSource
from rdflib.term import BNode, IdentifiedNode, Literal, Node, URIRef
from ..shared.jsonld.context import UNDEF, Context, Term
from ..shared.jsonld.keys import (
CONTEXT,
GRAPH,
ID,
INCLUDED,
INDEX,
JSON,
LANG,
LIST,
NEST,
NONE,
REV,
SET,
TYPE,
VALUE,
VOCAB,
)
from ..shared.jsonld.util import (
_HAS_ORJSON,
VOCAB_DELIMS,
context_from_urlinputsource,
json,
orjson,
source_to_json,
)
__all__ = ["JsonLDParser", "to_rdf"]
TYPE_TERM = Term(str(RDF.type), TYPE, VOCAB) # type: ignore[call-arg]
ALLOW_LISTS_OF_LISTS = True # NOTE: Not allowed in JSON-LD 1.0
class JsonLDParser(rdflib.parser.Parser):
    """RDFLib parser plugin that reads JSON-LD and hands it to :func:`to_rdf`."""

    def __init__(self):
        super().__init__()

    def parse(
        self,
        source: InputSource,
        sink: Graph,
        version: float = 1.1,
        skolemize: bool = False,
        encoding: Optional[str] = "utf-8",
        base: Optional[str] = None,
        context: Optional[
            Union[
                List[Union[Dict[str, Any], str, None]],
                Dict[str, Any],
                str,
            ]
        ] = None,
        generalized_rdf: Optional[bool] = False,
        extract_all_scripts: Optional[bool] = False,
        **kwargs: Any,
    ) -> None:
        """Parse JSON-LD from a source document.

        The source document can be JSON or HTML with embedded JSON script
        elements (type attribute = "application/ld+json"). To process as HTML
        ``source.content_type`` must be set to "text/html" or
        "application/xhtml+xml".

        :param source: InputSource with JSON-formatted data (JSON or HTML)
        :param sink: Graph to receive the parsed triples
        :param version: parse as JSON-LD version, defaults to 1.1. A value
            that cannot be converted to ``float`` (including ``None``) falls
            back to 1.1
        :param skolemize: replace generated blank nodes by their skolemized
            form, defaults to False
        :param encoding: character encoding of the JSON (should be "utf-8"
            or "utf-16"), defaults to "utf-8"
        :param base: JSON-LD `Base IRI <https://www.w3.org/TR/json-ld/#base-iri>`_, defaults to None
        :param context: JSON-LD `Context <https://www.w3.org/TR/json-ld/#the-context>`_, defaults to None
        :param generalized_rdf: parse as `Generalized RDF <https://www.w3.org/TR/json-ld/#relationship-to-rdf>`_, defaults to False
        :param extract_all_scripts: if source is an HTML document then extract
            all script elements, defaults to False (extract only the first
            script element). This is ignored if ``source.system_id`` contains
            a fragment identifier, in which case only the script element with
            matching id attribute is extracted.
        """
        if encoding not in ("utf-8", "utf-16"):
            warnings.warn(
                "JSON should be encoded as unicode. "
                "Given encoding was: %s" % encoding
            )

        if not base:
            base = sink.absolutize(source.getPublicId() or source.getSystemId() or "")

        context_data = context
        if not context_data and hasattr(source, "url") and hasattr(source, "links"):
            if TYPE_CHECKING:
                assert isinstance(source, URLInputSource)
            context_data = context_from_urlinputsource(source)

        try:
            version = float(version)
        except (TypeError, ValueError):
            # float() raises TypeError for None and other non-numeric objects,
            # ValueError for unparsable strings; both fall back to 1.1.
            version = 1.1

        # Get the optional fragment identifier
        try:
            fragment_id = URIRef(source.getSystemId()).fragment
        except Exception:
            fragment_id = None

        data, html_base = source_to_json(source, fragment_id, extract_all_scripts)
        if html_base is not None:
            base = URIRef(html_base, base=base)

        # NOTE: A ConjunctiveGraph parses into a Graph sink, so no sink will be
        # context_aware. Keeping this check in case RDFLib is changed, or
        # someone passes something context_aware to this parser directly.
        conj_sink: Graph
        if not sink.context_aware:
            conj_sink = ConjunctiveGraph(store=sink.store, identifier=sink.identifier)
        else:
            conj_sink = sink

        to_rdf(
            data,
            conj_sink,
            base,
            context_data,
            version,
            bool(generalized_rdf),
            skolemize=skolemize,
        )
def to_rdf(
    data: Any,
    dataset: Graph,
    base: Optional[str] = None,
    context_data: Optional[
        Union[
            List[Union[Dict[str, Any], str, None]],
            Dict[str, Any],
            str,
        ]
    ] = None,
    version: Optional[float] = None,
    generalized_rdf: bool = False,
    allow_lists_of_lists: Optional[bool] = None,
    skolemize: bool = False,
):
    """Load decoded JSON-LD *data* into *dataset*.

    :param data: the decoded JSON-LD document
    :param dataset: graph that receives the triples
    :param base: base IRI handed to the new ``Context``
    :param context_data: extra context loaded into the ``Context`` when truthy
    :param version: JSON-LD version handed to the new ``Context``
    :param generalized_rdf: forwarded to :class:`Parser`
    :param allow_lists_of_lists: forwarded to :class:`Parser`
    :param skolemize: forwarded to :class:`Parser`
    :return: whatever ``Parser.parse`` returns for this input
    """
    active_context = Context(base=base, version=version)
    if context_data:
        active_context.load(context_data)

    options = {
        "generalized_rdf": generalized_rdf,
        "allow_lists_of_lists": allow_lists_of_lists,
        "skolemize": skolemize,
    }
    return Parser(**options).parse(data, active_context, dataset)
class Parser:
    """Turn decoded JSON-LD data into triples on an RDFLib graph.

    The object carries three options (``skolemize``, ``generalized_rdf`` and
    ``allow_lists_of_lists``) plus a cache that maps ``@id`` values resolving
    to an empty IRI onto the blank node used in their place.
    """

    def __init__(
        self,
        generalized_rdf: bool = False,
        allow_lists_of_lists: Optional[bool] = None,
        skolemize: bool = False,
    ):
        """Store the parsing options.

        :param generalized_rdf: allow blank-node predicates and subject ids
            without a ``:``
        :param allow_lists_of_lists: allow a list as a member of a list;
            ``None`` selects the module default ``ALLOW_LISTS_OF_LISTS``
        :param skolemize: replace every created blank node by its skolemized
            form
        """
        self.skolemize = skolemize
        self.generalized_rdf = generalized_rdf
        self.allow_lists_of_lists = (
            allow_lists_of_lists
            if allow_lists_of_lists is not None
            else ALLOW_LISTS_OF_LISTS
        )
        self.invalid_uri_to_bnode: dict[str, BNode] = {}

    def parse(self, data: Any, context: Context, dataset: Graph) -> Graph:
        """Add every node of *data* to *dataset* and return the target graph.

        :param data: decoded JSON-LD; a list of nodes or a single node dict.
            Any other value contributes no triples.
        :param context: active context; a top-level ``@context`` in *data* is
            loaded into it
        :param dataset: graph (or context-aware graph) receiving the triples
        :return: ``dataset.default_context`` if *dataset* is context aware,
            otherwise *dataset* itself
        """
        topcontext = False
        resources: List[Any]
        if isinstance(data, list):
            resources = data
        elif isinstance(data, dict):
            local_context = data.get(CONTEXT)
            if local_context:
                context.load(local_context, context.base)
                topcontext = True
            resources = [data]
        else:
            # A scalar top-level value holds no nodes. Non-dict nodes are
            # already ignored by _add_to_graph, so do the same here instead of
            # failing on an unbound local.
            resources = []

        if context.vocab:
            dataset.bind(None, context.vocab)
        for name, term in context.terms.items():
            if term.id and term.id.endswith(VOCAB_DELIMS):
                dataset.bind(name, term.id)

        # type error: "Graph" has no attribute "default_context"
        graph = dataset.default_context if dataset.context_aware else dataset  # type: ignore[attr-defined]

        for node in resources:
            self._add_to_graph(dataset, graph, context, node, topcontext)

        return graph

    def _add_to_graph(
        self,
        dataset: Graph,
        graph: Graph,
        context: Context,
        node: Any,
        topcontext: bool = False,
    ) -> Optional[Node]:
        """Add one node object to *graph* and return its subject.

        Returns ``None`` when *node* is not a dict, when it is a value object
        (``context.get_value(node)`` is truthy) or when its id cannot be
        converted to an RDF node.
        """
        if not isinstance(node, dict) or context.get_value(node):
            # type error: Return value expected
            return  # type: ignore[return-value]

        # A top-level @context was already loaded by parse(); do not apply it
        # a second time.
        if CONTEXT in node and not topcontext:
            local_context = node[CONTEXT]
            if local_context:
                context = context.subcontext(local_context)
            else:
                context = Context(base=context.doc_base)

        # type error: Incompatible types in assignment (expression has type "Optional[Context]", variable has type "Context")
        context = context.get_context_for_type(node)  # type: ignore[assignment]

        id_val = context.get_id(node)

        if id_val is None:
            nested_id = self._get_nested_id(context, node)
            if nested_id is not None and len(nested_id) > 0:
                id_val = nested_id

        if isinstance(id_val, str):
            subj = self._to_rdf_id(context, id_val)
        else:
            subj = BNode()
            if self.skolemize:
                subj = subj.skolemize()

        if subj is None:
            return None

        # NOTE: crude way to signify that this node might represent a named graph
        no_id = id_val is None

        for key, obj in node.items():
            if key == CONTEXT or key in context.get_keys(ID):
                continue

            if key == REV or key in context.get_keys(REV):
                for rkey, robj in obj.items():
                    self._key_to_graph(
                        dataset,
                        graph,
                        context,
                        subj,
                        rkey,
                        robj,
                        reverse=True,
                        no_id=no_id,
                    )
            else:
                self._key_to_graph(dataset, graph, context, subj, key, obj, no_id=no_id)

        return subj

    # type error: Missing return statement
    def _get_nested_id(self, context: Context, node: Dict[str, Any]) -> Optional[str]:  # type: ignore[return]
        """Return the first string id found inside the ``@nest`` members of *node*.

        Only consulted for context version 1.1 or later; returns ``None``
        implicitly when nothing is found.
        """
        for key, obj in node.items():
            if context.version >= 1.1 and key in context.get_keys(NEST):
                term = context.terms.get(key)
                if term and term.id is None:
                    continue
                objs = obj if isinstance(obj, list) else [obj]
                for obj in objs:
                    if not isinstance(obj, dict):
                        continue
                    id_val = context.get_id(obj)
                    if not id_val:
                        subcontext = context.get_context_for_term(
                            context.terms.get(key)
                        )
                        id_val = self._get_nested_id(subcontext, obj)
                    if isinstance(id_val, str):
                        return id_val

    def _key_to_graph(
        self,
        dataset: Graph,
        graph: Graph,
        context: Context,
        subj: Node,
        key: str,
        obj: Any,
        reverse: bool = False,
        no_id: bool = False,
    ) -> None:
        """Add the triples produced by one ``key: obj`` member of a node.

        :param subj: subject of the node that owns the member
        :param key: member key as written in the document
        :param obj: member value
        :param reverse: emit ``(object, predicate, subj)`` instead of
            ``(subj, predicate, object)``
        :param no_id: the owning node had no id, so a ``@graph`` member is
            added to *graph* rather than to a graph named by *subj*
        """
        if isinstance(obj, list):
            obj_nodes = obj
        else:
            obj_nodes = [obj]

        term = context.terms.get(key)
        if term:
            term_id = term.id
            if term.type == JSON:
                obj_nodes = [self._to_typed_json_value(obj)]
            elif LIST in term.container:
                obj_nodes = [self._expand_nested_list(obj_nodes)]
            elif isinstance(obj, dict):
                obj_nodes = self._parse_container(context, term, obj)
        else:
            term_id = None

        if TYPE in (key, term_id):
            term = TYPE_TERM

        if GRAPH in (key, term_id):
            if dataset.context_aware and not no_id:
                if TYPE_CHECKING:
                    assert isinstance(dataset, ConjunctiveGraph)
                # type error: Argument 1 to "get_context" of "ConjunctiveGraph" has incompatible type "Node"; expected "Union[IdentifiedNode, str, None]"
                subgraph = dataset.get_context(subj)  # type: ignore[arg-type]
            else:
                subgraph = graph
            for onode in obj_nodes:
                self._add_to_graph(dataset, subgraph, context, onode)
            return

        if SET in (key, term_id):
            for onode in obj_nodes:
                self._add_to_graph(dataset, graph, context, onode)
            return

        if INCLUDED in (key, term_id):
            for onode in obj_nodes:
                self._add_to_graph(dataset, graph, context, onode)
            return

        if context.version >= 1.1 and key in context.get_keys(NEST):
            term = context.terms.get(key)
            if term and term.id is None:
                return
            objs = obj if isinstance(obj, list) else [obj]
            for obj in objs:
                if not isinstance(obj, dict):
                    continue
                for nkey, nobj in obj.items():
                    # NOTE: we've already captured subject
                    if nkey in context.get_keys(ID):
                        continue
                    subcontext = context.get_context_for_type(obj)
                    # type error: Argument 3 to "_key_to_graph" of "Parser" has incompatible type "Optional[Context]"; expected "Context"
                    self._key_to_graph(dataset, graph, subcontext, subj, nkey, nobj)  # type: ignore[arg-type]
            return

        pred_uri = term.id if term else context.expand(key)

        context = context.get_context_for_term(term)

        # Flatten deep nested lists
        def flatten(n: Iterable[Any]) -> List[Any]:
            flattened = []
            for obj in n:
                if isinstance(obj, dict):
                    objs = context.get_set(obj)
                    if objs is not None:
                        obj = objs
                if isinstance(obj, list):
                    flattened += flatten(obj)
                    continue
                flattened.append(obj)
            return flattened

        obj_nodes = flatten(obj_nodes)

        if not pred_uri:
            return

        if term and term.reverse:
            reverse = not reverse

        pred: IdentifiedNode
        bid = self._get_bnodeid(pred_uri)
        if bid:
            # Blank-node predicates are only emitted for generalized RDF.
            if not self.generalized_rdf:
                return
            pred = BNode(bid)
            if self.skolemize:
                pred = pred.skolemize()
        else:
            pred = URIRef(pred_uri)

        for obj_node in obj_nodes:
            obj = self._to_object(dataset, graph, context, term, obj_node)
            if obj is None:
                continue
            if reverse:
                graph.add((obj, pred, subj))
            else:
                graph.add((subj, pred, obj))

    def _parse_container(
        self, context: Context, term: Term, obj: Dict[str, Any]
    ) -> List[Any]:
        """Expand a container map *obj* into a list of object nodes.

        The branch taken depends on the entries of ``term.container``
        (language, graph, id, type or index maps). Language maps yield
        ``(value, lang)`` tuples; a map matching no branch is returned as
        ``[obj]``.
        """
        if LANG in term.container:
            obj_nodes = []
            for lang, values in obj.items():
                if not isinstance(values, list):
                    values = [values]
                if lang in context.get_keys(NONE):
                    obj_nodes += values
                else:
                    for v in values:
                        obj_nodes.append((v, lang))
            return obj_nodes

        v11 = context.version >= 1.1

        if v11 and GRAPH in term.container and ID in term.container:
            return [
                (
                    dict({GRAPH: o})
                    if k in context.get_keys(NONE)
                    else dict({ID: k, GRAPH: o}) if isinstance(o, dict) else o
                )
                for k, o in obj.items()
            ]

        elif v11 and GRAPH in term.container and INDEX in term.container:
            return [dict({GRAPH: o}) for k, o in obj.items()]

        elif v11 and GRAPH in term.container:
            return [dict({GRAPH: obj})]

        elif v11 and ID in term.container:
            return [
                (
                    dict({ID: k}, **o)
                    if isinstance(o, dict) and k not in context.get_keys(NONE)
                    else o
                )
                for k, o in obj.items()
            ]

        elif v11 and TYPE in term.container:
            return [
                (
                    self._add_type(
                        context,
                        (
                            {ID: context.expand(o) if term.type == VOCAB else o}
                            if isinstance(o, str)
                            else o
                        ),
                        k,
                    )
                    if isinstance(o, (dict, str)) and k not in context.get_keys(NONE)
                    else o
                )
                for k, o in obj.items()
            ]

        elif INDEX in term.container:
            obj_nodes = []
            for key, nodes in obj.items():
                if not isinstance(nodes, list):
                    nodes = [nodes]
                for node in nodes:
                    if v11 and term.index and key not in context.get_keys(NONE):
                        if not isinstance(node, dict):
                            node = {ID: node}
                        # Record the index key as an extra value of the
                        # property named by term.index.
                        values = node.get(term.index, [])
                        if not isinstance(values, list):
                            values = [values]
                        values.append(key)
                        node[term.index] = values
                    obj_nodes.append(node)
            return obj_nodes

        return [obj]

    @staticmethod
    def _add_type(context: Context, o: Dict[str, Any], k: str) -> Dict[str, Any]:
        """Append *k* to the types of node *o* (in place) and return *o*."""
        otype = context.get_type(o) or []
        if otype and not isinstance(otype, list):
            otype = [otype]
        otype.append(k)
        o[TYPE] = otype
        return o

    def _to_object(
        self,
        dataset: Graph,
        graph: Graph,
        context: Context,
        term: Optional[Term],
        node: Any,
        inlist: bool = False,
    ) -> Optional[Node]:
        """Convert one object node into an RDF node.

        *node* may be a ``(value, lang)`` tuple, a dict (list object, value
        object or nested node) or a plain JSON value. Returns ``None`` when
        nothing should be emitted for it.

        :param inlist: *node* is a member of a list; a list object is then
            rejected unless ``allow_lists_of_lists`` is set
        """
        if isinstance(node, tuple):
            value, lang = node
            if value is None:
                # type error: Return value expected
                return  # type: ignore[return-value]
            if lang and " " in lang:
                # type error: Return value expected
                return  # type: ignore[return-value]
            return Literal(value, lang=lang)

        if isinstance(node, dict):
            node_list = context.get_list(node)
            if node_list is not None:
                if inlist and not self.allow_lists_of_lists:
                    # type error: Return value expected
                    return  # type: ignore[return-value]
                listref = self._add_list(dataset, graph, context, term, node_list)
                if listref:
                    return listref

        else:  # expand compacted value
            if term and term.type:
                if term.type == JSON:
                    node = self._to_typed_json_value(node)
                elif node is None:
                    # type error: Return value expected
                    return  # type: ignore[return-value]
                elif term.type == ID and isinstance(node, str):
                    node = {ID: context.resolve(node)}
                elif term.type == VOCAB and isinstance(node, str):
                    node = {ID: context.expand(node) or context.resolve_iri(node)}
                else:
                    node = {TYPE: term.type, VALUE: node}
            else:
                if node is None:
                    # type error: Return value expected
                    return  # type: ignore[return-value]
                if isinstance(node, float):
                    return Literal(node, datatype=XSD.double)

                if term and term.language is not UNDEF:
                    lang = term.language
                else:
                    lang = context.language
                return Literal(node, lang=lang)

        lang = context.get_language(node)
        # A language tag takes precedence over a datatype.
        datatype = not lang and context.get_type(node) or None
        value = context.get_value(node)

        # type error: Unsupported operand types for in ("Optional[Any]" and "Generator[str, None, None]")
        if datatype in context.get_keys(JSON):  # type: ignore[operator]
            node = self._to_typed_json_value(value)
            datatype = context.get_type(node)
            value = context.get_value(node)

        if lang or context.get_key(VALUE) in node or VALUE in node:
            if value is None:
                return None
            if lang:
                if " " in lang:
                    # type error: Return value expected
                    return  # type: ignore[return-value]
                return Literal(value, lang=lang)
            elif datatype:
                return Literal(value, datatype=context.expand(datatype))
            else:
                return Literal(value)
        else:
            return self._add_to_graph(dataset, graph, context, node)

    def _to_rdf_id(self, context: Context, id_val: str) -> Optional[IdentifiedNode]:
        """Convert an ``@id`` string into a blank node or URI reference.

        Returns ``None`` when the resolved id contains no ``:`` and
        generalized RDF is off. An id that resolves to an empty IRI is mapped
        to a blank node that is reused for every later occurrence of the same
        *id_val*.
        """
        bid = self._get_bnodeid(id_val)
        if bid:
            b = BNode(bid)
            if self.skolemize:
                return b.skolemize()
            return b
        else:
            uri = context.resolve(id_val)
            if not self.generalized_rdf and ":" not in uri:
                return None
            node: IdentifiedNode = URIRef(uri)
            if not str(node):
                if id_val not in self.invalid_uri_to_bnode:
                    self.invalid_uri_to_bnode[id_val] = BNode(secrets.token_urlsafe(20))
                node = self.invalid_uri_to_bnode[id_val]
            return node

    def _get_bnodeid(self, ref: str) -> Optional[str]:
        """Return the label after a leading ``_:``, or ``None``.

        ``None`` is returned both when *ref* does not start with ``_:`` and
        when nothing follows the prefix.
        """
        if not ref.startswith("_:"):
            # type error: Return value expected
            return  # type: ignore[return-value]
        bid = ref.split("_:", 1)[-1]
        return bid or None

    def _add_list(
        self,
        dataset: Graph,
        graph: Graph,
        context: Context,
        term: Optional[Term],
        node_list: Any,
    ) -> IdentifiedNode:
        """Add *node_list* to *graph* as a chain of ``rdf:first``/``rdf:rest``.

        ``None`` members and members that convert to no object are skipped.

        :return: the head of the list, or ``RDF.nil`` if no member was added
        """
        if not isinstance(node_list, list):
            node_list = [node_list]

        first_subj: Union[URIRef, BNode] = BNode()
        if self.skolemize and isinstance(first_subj, BNode):
            first_subj = first_subj.skolemize()

        rest: Union[URIRef, BNode, None]
        subj, rest = first_subj, None

        for node in node_list:
            if node is None:
                continue

            # ``rest`` is the cell prepared after the previous member; link
            # it in only now that another member is about to be added.
            if rest:
                # type error: Statement is unreachable
                graph.add((subj, RDF.rest, rest))  # type: ignore[unreachable]
                subj = rest

            obj = self._to_object(dataset, graph, context, term, node, inlist=True)

            if obj is None:
                continue

            graph.add((subj, RDF.first, obj))
            rest = BNode()
            if self.skolemize and isinstance(rest, BNode):
                rest = rest.skolemize()

        if rest:
            graph.add((subj, RDF.rest, RDF.nil))
            return first_subj
        else:
            return RDF.nil

    @staticmethod
    def _to_typed_json_value(value: Any) -> Dict[str, str]:
        """Serialize *value* to JSON text wrapped as an ``rdf:JSON`` value object.

        Keys are sorted in both the orjson and the stdlib code path.
        """
        if _HAS_ORJSON:
            val_string: str = orjson.dumps(
                value,
                option=orjson.OPT_SORT_KEYS | orjson.OPT_NON_STR_KEYS,
            ).decode("utf-8")
        else:
            val_string = json.dumps(
                value, separators=(",", ":"), sort_keys=True, ensure_ascii=False
            )
        return {
            TYPE: RDF.JSON,
            VALUE: val_string,
        }

    @classmethod
    def _expand_nested_list(cls, obj_nodes: List[Any]) -> Dict[str, List[Any]]:
        """Wrap *obj_nodes*, and recursively every nested list, in a list object."""
        result = [
            cls._expand_nested_list(o) if isinstance(o, list) else o for o in obj_nodes
        ]
        return {LIST: result}
@@ -0,0 +1,133 @@
"""
This is a rdflib plugin for parsing NQuad files into Conjunctive
graphs that can be used and queried. The store that backs the graph
*must* be able to handle contexts.
>>> from rdflib import ConjunctiveGraph, URIRef, Namespace
>>> g = ConjunctiveGraph()
>>> data = open("test/data/nquads.rdflib/example.nquads", "rb")
>>> g.parse(data, format="nquads") # doctest:+ELLIPSIS
<Graph identifier=... (<class 'rdflib.graph.Graph'>)>
>>> assert len(g.store) == 449
>>> # There should be 16 separate contexts
>>> assert len([x for x in g.store.contexts()]) == 16
>>> # is the name of entity E10009 "Arco Publications"?
>>> # (in graph http://bibliographica.org/entity/E10009)
>>> # Looking for:
>>> # <http://bibliographica.org/entity/E10009>
>>> # <http://xmlns.com/foaf/0.1/name>
>>> # "Arco Publications"
>>> # <http://bibliographica.org/entity/E10009>
>>> s = URIRef("http://bibliographica.org/entity/E10009")
>>> FOAF = Namespace("http://xmlns.com/foaf/0.1/")
>>> assert(g.value(s, FOAF.name).eq("Arco Publications"))
"""
from __future__ import annotations
from codecs import getreader
from typing import Any, MutableMapping, Optional
from rdflib.exceptions import ParserError as ParseError
from rdflib.graph import ConjunctiveGraph, Dataset, Graph
from rdflib.parser import InputSource
# Build up from the NTriples parser:
from rdflib.plugins.parsers.ntriples import W3CNTriplesParser, r_tail, r_wspace
from rdflib.term import BNode
__all__ = ["NQuadsParser"]
_BNodeContextType = MutableMapping[str, BNode]
class NQuadsParser(W3CNTriplesParser):
    """N-Quads parser built on the N-Triples parser, adding a graph term."""

    # type error: Signature of "parse" incompatible with supertype "W3CNTriplesParser"
    def parse(  # type: ignore[override]
        self,
        inputsource: InputSource,
        sink: Graph,
        bnode_context: Optional[_BNodeContextType] = None,
        skolemize: bool = False,
        **kwargs: Any,
    ):
        """
        Read N-Quads statements from *inputsource* into the store behind *sink*.

        :type inputsource: `rdflib.parser.InputSource`
        :param inputsource: the source of N-Quads-formatted data
        :type sink: `rdflib.graph.Graph`
        :param sink: graph whose (context-aware) store receives the quads
        :type bnode_context: `dict`, optional
        :param bnode_context: a dict mapping blank node identifiers to `~rdflib.term.BNode` instances.
                              See `.W3CNTriplesParser.parse`
        :param skolemize: stored on the parser as ``self.skolemize``
        :return: the dataset the statements were added to
        """
        assert (
            sink.store.context_aware
        ), "NQuadsParser must be given a context-aware store."

        # default_union=True mimics ConjunctiveGraph behavior
        dataset = Dataset(store=sink.store, default_union=True)
        original_default = dataset.default_context  # the DEFAULT_DATASET_GRAPH_ID

        # Work out which graph should take statements that name no graph.
        if isinstance(sink, (Dataset, ConjunctiveGraph)):
            chosen_default = sink.default_context
        elif sink.identifier is None:
            chosen_default = None
        elif sink.identifier == original_default.identifier:
            chosen_default = sink
        else:
            chosen_default = dataset.get_context(sink.identifier)

        if chosen_default is not None:
            dataset.default_context = chosen_default
            # the default graph created with the Dataset is no longer used
            dataset.remove_graph(original_default)

        # type error: Incompatible types in assignment (expression has type "ConjunctiveGraph", base class "W3CNTriplesParser" defined the type as "Union[DummySink, NTGraphSink]")
        self.sink: Dataset = dataset  # type: ignore[assignment]
        self.skolemize = skolemize

        stream = inputsource.getCharacterStream()
        if not stream:
            # No character stream: decode the byte stream as UTF-8 instead.
            stream = getreader("utf-8")(inputsource.getByteStream())

        if not hasattr(stream, "read"):
            raise ParseError("Item to parse must be a file-like object.")

        self.file = stream
        self.buffer = ""
        while True:
            # parseline() consumes self.line, so keep the untouched text for
            # the error message.
            current = self.readline()
            self.line = current
            if current is None:
                break
            try:
                self.parseline(bnode_context)
            except ParseError as msg:
                raise ParseError("Invalid line (%s):\n%r" % (msg, current))

        return self.sink

    def parseline(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
        """Parse the statement held in ``self.line`` and add it to the sink.

        Empty lines and comment lines are skipped.

        :raises ParseError: if text remains after the terminating ``.``
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith("#"):
            return  # nothing to parse: blank line or comment

        subj = self.subject(bnode_context)
        self.eat(r_wspace)

        pred = self.predicate()
        self.eat(r_wspace)

        obj = self.object(bnode_context)
        self.eat(r_wspace)

        graph_name = self.uriref() or self.nodeid(bnode_context)
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")

        # Must have a context aware store - add on a normal Graph
        # discards anything where the ctx != graph.identifier
        target = (
            self.sink.get_context(graph_name)
            if graph_name
            else self.sink.default_context
        )
        target.add((subj, pred, obj))
@@ -0,0 +1,385 @@
"""\
N-Triples Parser
License: GPL 2, W3C, BSD, or MIT
Author: Sean B. Palmer, inamidst.com
"""
from __future__ import annotations
import codecs
import re
from io import BytesIO, StringIO, TextIOBase
from typing import (
IO,
TYPE_CHECKING,
Any,
Match,
MutableMapping,
Optional,
Pattern,
TextIO,
Union,
)
from rdflib.compat import _string_escape_map, decodeUnicodeEscape
from rdflib.exceptions import ParserError as ParseError
from rdflib.parser import InputSource, Parser
from rdflib.term import BNode as bNode
from rdflib.term import Literal, URIRef
from rdflib.term import URIRef as URI # noqa: N814
if TYPE_CHECKING:
import typing_extensions as te
from rdflib.graph import Graph, _ObjectType, _PredicateType, _SubjectType
__all__ = [
"unquote",
"uriquote",
"W3CNTriplesParser",
"NTGraphSink",
"NTParser",
"DummySink",
]
uriref = r'<([^:]+:[^\s"<>]*)>'
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
litinfo = r"(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)|\^\^" + uriref + r")?"
r_line = re.compile(r"([^\r\n]*)(?:\r\n|\r|\n)")
r_wspace = re.compile(r"[ \t]*")
r_wspaces = re.compile(r"[ \t]+")
r_tail = re.compile(r"[ \t]*\.[ \t]*(#.*)?")
r_uriref = re.compile(uriref)
r_nodeid = re.compile(r"_:([A-Za-z0-9_:]([-A-Za-z0-9_:\.]*[-A-Za-z0-9_:])?)")
r_literal = re.compile(literal + litinfo)
bufsiz = 2048
validate = False
class DummySink:
    """Sink that counts the triples it receives and prints each one."""

    def __init__(self):
        # Number of triples received so far.
        self.length = 0

    def triple(self, s, p, o):
        """Count the triple, then print its three terms."""
        self.length = self.length + 1
        print(s, p, o)
r_safe = re.compile(r"([\x20\x21\x23-\x5B\x5D-\x7E]+)")
r_quot = re.compile(r"""\\([tbnrf"'\\])""")
r_uniquot = re.compile(r"\\u([0-9A-Fa-f]{4})|\\U([0-9A-Fa-f]{8})")
def unquote(s: str) -> str:
    """Unquote an N-Triples string.

    With the module flag ``validate`` off, the whole string is decoded in one
    step. With it on, the string is consumed piece by piece and any character
    or escape that is not allowed raises ``ParseError``.
    """
    if not validate:
        if isinstance(s, str):  # nquads
            return decodeUnicodeEscape(s)
        return s.decode("unicode-escape")  # type: ignore[unreachable]

    pieces = []
    while s:
        # 1. a run of characters that need no unescaping
        safe = r_safe.match(s)
        if safe:
            s = s[safe.end() :]
            pieces.append(safe.group(1))
            continue

        # 2. a two-character escape such as \n or \"
        simple = r_quot.match(s)
        if simple:
            s = s[2:]
            pieces.append(_string_escape_map[simple.group(1)])
            continue

        # 3. a \uXXXX or \UXXXXXXXX escape; anything else is an error
        uni = r_uniquot.match(s)
        if not uni:
            if s.startswith("\\"):
                raise ParseError("Illegal escape at: %s..." % s[:10])
            raise ParseError("Illegal literal character: %r" % s[0])

        s = s[uni.end() :]
        short_hex, long_hex = uni.groups()
        codepoint = int(short_hex or long_hex, 16)
        if codepoint > 0x10FFFF:
            raise ParseError("Disallowed codepoint: %08X" % codepoint)
        pieces.append(chr(codepoint))

    return "".join(pieces)
r_hibyte = re.compile(r"([\x80-\xFF])")
def uriquote(uri: str) -> str:
    """Percent-encode characters U+0080..U+00FF of *uri* when validating.

    With the module flag ``validate`` off, *uri* is returned unchanged.
    """
    if validate:
        return r_hibyte.sub(lambda m: "%%%02X" % ord(m.group(1)), uri)
    return uri
_BNodeContextType = MutableMapping[str, bNode]
class W3CNTriplesParser:
"""An N-Triples Parser.
This is a legacy-style Triples parser for NTriples provided by W3C
Usage::
p = W3CNTriplesParser(sink=MySink())
sink = p.parse(f) # file; use parsestring for a string
To define a context in which blank node identifiers refer to the same blank node
across instances of NTriplesParser, pass the same dict as ``bnode_context`` to each
instance. By default, a new blank node context is created for each instance of
`W3CNTriplesParser`.
"""
__slots__ = ("_bnode_ids", "sink", "buffer", "file", "line", "skolemize")
def __init__(
    self,
    sink: Optional[Union[DummySink, NTGraphSink]] = None,
    bnode_context: Optional[_BNodeContextType] = None,
):
    """Set up the parser state.

    :param sink: object receiving the parsed triples; a new ``DummySink``
        is created when omitted
    :param bnode_context: mapping stored as the parser's blank-node table;
        a new empty dict is used when omitted
    """
    self.skolemize = False
    self._bnode_ids = {} if bnode_context is None else bnode_context

    self.sink: Union[DummySink, NTGraphSink] = (
        DummySink() if sink is None else sink
    )

    # Reading state, filled in by parse().
    self.buffer: Optional[str] = None
    self.file: Optional[Union[TextIO, codecs.StreamReader]] = None
    self.line: Optional[str] = ""
def parse(
    self,
    f: Union[TextIO, IO[bytes], codecs.StreamReader],
    bnode_context: Optional[_BNodeContextType] = None,
    skolemize: bool = False,
) -> Union[DummySink, NTGraphSink]:
    """
    Parse f as an N-Triples file.

    :type f: :term:`file object`
    :param f: the N-Triples source
    :type bnode_context: `dict`, optional
    :param bnode_context: a dict mapping blank node identifiers (e.g., ``a`` in ``_:a``)
                          to `~rdflib.term.BNode` instances. An empty dict can be
                          passed in to define a distinct context for a given call to
                          `parse`.
    :param skolemize: stored on the parser as ``self.skolemize``
    :return: the sink the triples were sent to
    :raises ParseError: if *f* has no ``read`` attribute or a line is invalid
    """
    if not hasattr(f, "read"):
        raise ParseError("Item to parse must be a file-like object.")

    looks_like_text = hasattr(f, "encoding") or hasattr(f, "charbuffer")
    if not looks_like_text:
        # someone still using a bytestream here?
        f = codecs.getreader("utf-8")(f)

    self.skolemize = skolemize
    self.file = f  # type: ignore[assignment]
    self.buffer = ""

    while True:
        self.line = self.readline()
        if self.line is None:
            # readline() signals the end of the input with None
            break
        try:
            self.parseline(bnode_context=bnode_context)
        except ParseError:
            raise ParseError("Invalid line: {}".format(self.line))

    return self.sink
def parsestring(self, s: Union[bytes, bytearray, str], **kwargs) -> None:
    """Parse s as an N-Triples string.

    :param s: the document, either text or UTF-8 encoded bytes
    :param kwargs: passed through unchanged to `parse`
    :raises ParseError: if ``s`` is not a ``str``, ``bytes`` or ``bytearray``
    """
    if isinstance(s, str):
        stream: Union[codecs.StreamReader, StringIO] = StringIO(s)
    elif isinstance(s, (bytes, bytearray)):
        # Binary input is decoded as UTF-8 while it is being read.
        stream = codecs.getreader("utf-8")(BytesIO(s))
    else:
        raise ParseError("Item to parse must be a string instance.")
    self.parse(stream, **kwargs)
def readline(self) -> Optional[str]:
    """Read an N-Triples line from buffered input.

    Text is pulled from ``self.file`` in reads of ``bufsiz`` and kept in
    ``self.buffer`` until ``r_line`` matches at the start of the buffer; the
    matched text is then removed from the buffer.

    :return: group 1 of the ``r_line`` match (presumably the line without its
        terminator -- confirm against the pattern definition), or ``None``
        once the source is exhausted
    """
    # N-Triples lines end in either CRLF, CR, or LF
    # Therefore, we can't just use f.readline()
    if not self.buffer:
        # type error: Item "None" of "Union[TextIO, StreamReader, None]" has no attribute "read"
        buffer = self.file.read(bufsiz)  # type: ignore[union-attr]
        if not buffer:
            # Nothing buffered and nothing left to read.
            return None
        self.buffer = buffer

    while True:
        m = r_line.match(self.buffer)
        if m:  # the more likely prospect
            # Keep whatever follows the matched line for the next call.
            self.buffer = self.buffer[m.end() :]
            return m.group(1)
        else:
            # No complete line buffered yet: read more and retry the match.
            # type error: Item "None" of "Union[TextIO, StreamReader, None]" has no attribute "read"
            buffer = self.file.read(bufsiz)  # type: ignore[union-attr]
            if not buffer and not self.buffer.isspace():
                # Last line does not need to be terminated with a newline
                buffer += "\n"
            elif not buffer:
                # Only whitespace remains at end of input.
                return None
            self.buffer += buffer
def parseline(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
    """Parse the statement held in ``self.line`` and pass it to the sink.

    Empty lines and lines starting with ``#`` are ignored.

    :param bnode_context: forwarded to `subject` and `object`
    :raises ParseError: if text remains after the statement tail
    """
    self.eat(r_wspace)
    remaining = self.line
    if not remaining or remaining.startswith("#"):
        # The line is empty or a comment
        return

    subject = self.subject(bnode_context)
    self.eat(r_wspaces)
    predicate = self.predicate()
    self.eat(r_wspaces)
    object_ = self.object(bnode_context)
    self.eat(r_tail)

    if not self.line:
        self.sink.triple(subject, predicate, object_)
        return
    raise ParseError("Trailing garbage: {}".format(self.line))
def peek(self, token: str) -> bool:
    """Return whether the unparsed rest of the line starts with ``token``."""
    rest = self.line
    return rest.startswith(token)  # type: ignore[union-attr]
def eat(self, pattern: Pattern[str]) -> Match[str]:
    """Match ``pattern`` at the start of the line and drop the matched text.

    :param pattern: compiled regular expression to apply to ``self.line``
    :return: the match object
    :raises ParseError: if the pattern does not match
    """
    text = self.line
    found = pattern.match(text)  # type: ignore[arg-type]
    if found is None:
        raise ParseError("Failed to eat %s at %s" % (pattern.pattern, text))
    self.line = text[found.end() :]  # type: ignore[index]
    return found
def subject(self, bnode_context=None) -> Union[bNode, URIRef]:
    """Parse the subject term: a URI reference or, failing that, a node id.

    :param bnode_context: forwarded to `nodeid`
    :raises ParseError: if neither form yields a truthy term
    """
    # @@ Consider using dictionary cases
    node = self.uriref()
    if not node:
        node = self.nodeid(bnode_context)
    if not node:
        raise ParseError("Subject must be uriref or nodeID")
    return node
def predicate(self) -> Union[bNode, URIRef]:
    """Parse the predicate term, which has to be a URI reference.

    :raises ParseError: if `uriref` does not yield a truthy term
    """
    term = self.uriref()
    if term:
        return term
    raise ParseError("Predicate must be uriref")
def object(
    self, bnode_context: Optional[_BNodeContextType] = None
) -> Union[URI, bNode, Literal]:
    """Parse the object term: URI reference, node id or literal, tried in that order.

    :param bnode_context: forwarded to `nodeid`
    :raises ParseError: if the last attempt returned ``False``
    """
    term = self.uriref()
    if not term:
        term = self.nodeid(bnode_context)
    if not term:
        term = self.literal()
    # Only the ``False`` sentinel means "nothing recognised"; other falsy
    # terms returned by the last attempt are passed through.
    if term is False:
        raise ParseError("Unrecognised object type")
    return term
def uriref(self) -> Union[te.Literal[False], URI]:
    """Consume a ``<...>`` term if the line starts with ``<``.

    :return: ``URI`` built from the captured text after it has been passed
        through ``unquote`` and then ``uriquote``, or ``False`` when the line
        does not start with ``<``
    """
    if not self.peek("<"):
        return False
    raw = self.eat(r_uriref).group(1)
    return URI(uriquote(unquote(raw)))
def nodeid(
    self, bnode_context: Optional[_BNodeContextType] = None
) -> Union[te.Literal[False], bNode, URI]:
    """Consume a node id if the line starts with ``_``.

    :param bnode_context: mapping from labels to previously created nodes;
        ``self._bnode_ids`` is used when omitted
    :return: ``False`` when the line does not start with ``_``; the
        skolemized node when ``self.skolemize`` is set; otherwise the node
        recorded for the label, created and recorded on first use
    """
    if not self.peek("_"):
        return False

    if self.skolemize:
        label = self.eat(r_nodeid).group(1)
        return bNode(label).skolemize()

    # Fix for https://github.com/RDFLib/rdflib/issues/204
    context = self._bnode_ids if bnode_context is None else bnode_context
    label = self.eat(r_nodeid).group(1)
    known = context.get(label, None)
    if known is None:
        # Replace with freshly-generated document-specific BNode id
        fresh = bNode()
        # Store the mapping
        context[label] = fresh
        return fresh
    # Re-map to id specific to this doc
    return bNode(known)
def literal(self) -> Union[te.Literal[False], Literal]:
    """Consume a literal if the line starts with a double quote.

    :return: the parsed ``Literal``, or ``False`` when the line does not start
        with ``"``
    :raises ParseError: if both a language and a datatype were captured
    """
    if not self.peek('"'):
        return False

    lit, lang, dtype = self.eat(r_literal).groups()
    # Empty captures count as "not given".
    lang = lang or None
    dtype = URI(uriquote(unquote(dtype))) if dtype else None
    if lang and dtype:
        raise ParseError("Can't have both a language and a datatype")
    return Literal(unquote(lit), lang, dtype)
class NTGraphSink:
    """Sink that adds every triple it receives to a graph."""

    __slots__ = ("g",)

    def __init__(self, graph: Graph):
        """Remember ``graph`` as the destination for triples."""
        self.g = graph

    def triple(self, s: _SubjectType, p: _PredicateType, o: _ObjectType) -> None:
        """Add the statement ``(s, p, o)`` to the wrapped graph."""
        statement = (s, p, o)
        self.g.add(statement)
class NTParser(Parser):
    """parser for the ntriples format, often stored with the .nt extension

    See http://www.w3.org/TR/rdf-testcases/#ntriples"""

    __slots__ = ()

    @classmethod
    def parse(cls, source: InputSource, sink: Graph, **kwargs: Any) -> None:
        """
        Parse the NT format

        :type source: `rdflib.parser.InputSource`
        :param source: the source of NT-formatted data
        :type sink: `rdflib.graph.Graph`
        :param sink: where to send parsed triples
        :param kwargs: Additional arguments to pass to `.W3CNTriplesParser.parse`
        """
        f: Union[TextIO, IO[bytes], codecs.StreamReader]
        f = source.getCharacterStream()
        if not f:
            b = source.getByteStream()
            # TextIOBase includes: StringIO and TextIOWrapper
            if isinstance(b, TextIOBase):
                # f is not really a ByteStream, but a CharacterStream
                f = b  # type: ignore[assignment]
            else:
                # since N-Triples 1.1 files can and should be utf-8 encoded
                f = codecs.getreader("utf-8")(b)
        parser = W3CNTriplesParser(NTGraphSink(sink))
        try:
            parser.parse(f, **kwargs)
        finally:
            # Close the stream even when parsing raises, so it is not leaked.
            f.close()
@@ -0,0 +1,183 @@
from __future__ import annotations
from codecs import getreader
from enum import Enum
from typing import TYPE_CHECKING, Any, MutableMapping, Optional, Union
from rdflib.exceptions import ParserError as ParseError
from rdflib.graph import Dataset
from rdflib.parser import InputSource
from rdflib.plugins.parsers.nquads import NQuadsParser
# Build up from the NTriples parser:
from rdflib.plugins.parsers.ntriples import r_nodeid, r_tail, r_uriref, r_wspace
from rdflib.term import BNode, URIRef
if TYPE_CHECKING:
import typing_extensions as te
__all__ = ["RDFPatchParser", "Operation"]
_BNodeContextType = MutableMapping[str, BNode]
class Operation(Enum):
    """
    Enum of RDF Patch operations.

    Operations:
    - `AddTripleOrQuad` (A): Adds a triple or quad.
    - `DeleteTripleOrQuad` (D): Deletes a triple or quad.
    - `AddPrefix` (PA): Adds a prefix.
    - `DeletePrefix` (PD): Deletes a prefix.
    - `TransactionStart` (TX): Starts a transaction.
    - `TransactionCommit` (TC): Commits a transaction.
    - `TransactionAbort` (TA): Aborts a transaction.
    - `Header` (H): Specifies a header.
    """

    # Each value is the code that starts a patch line; the parser's
    # ``operation`` method matches it with ``line.startswith(op.value)``.
    AddTripleOrQuad = "A"
    DeleteTripleOrQuad = "D"
    AddPrefix = "PA"
    DeletePrefix = "PD"
    TransactionStart = "TX"
    TransactionCommit = "TC"
    TransactionAbort = "TA"
    Header = "H"
class RDFPatchParser(NQuadsParser):
def parse(  # type: ignore[override]
    self,
    inputsource: InputSource,
    sink: Dataset,
    bnode_context: Optional[_BNodeContextType] = None,
    skolemize: bool = False,
    **kwargs: Any,
) -> Dataset:
    """
    Parse inputsource as an RDF Patch file.

    :type inputsource: `rdflib.parser.InputSource`
    :param inputsource: the source of RDF Patch formatted data
    :type sink: `rdflib.graph.Dataset`
    :param sink: where to send parsed data
    :type bnode_context: `dict`, optional
    :param bnode_context: a dict mapping blank node identifiers to `~rdflib.term.BNode` instances.
                          See `.W3CNTriplesParser.parse`
    :type skolemize: `bool`, optional
    :param skolemize: stored on the parser as ``self.skolemize``
    :param kwargs: accepted but not used by this method
    :return: the dataset, built on ``sink.store``, that received the changes
    :raises ParseError: if the source is not file-like or a line is invalid
    """
    assert sink.store.context_aware, (
        "RDFPatchParser must be given" " a context aware store."
    )
    # type error: Incompatible types in assignment (expression has type "Dataset", base class "W3CNTriplesParser" defined the type as "Union[DummySink, NTGraphSink]")
    self.sink: Dataset = Dataset(store=sink.store)
    self.skolemize = skolemize

    source = inputsource.getCharacterStream()
    needs_decoding = not source
    if needs_decoding:
        source = inputsource.getByteStream()
    # Validate before wrapping: a codecs stream reader always has ``read``,
    # so checking afterwards could never reject an unusable byte stream.
    if not hasattr(source, "read"):
        raise ParseError("Item to parse must be a file-like object.")
    if needs_decoding:
        source = getreader("utf-8")(source)

    self.file = source
    self.buffer = ""
    while True:
        # Keep the line as read: parsing consumes self.line piece by piece.
        self.line = raw_line = self.readline()
        if raw_line is None:
            break
        try:
            self.parsepatch(bnode_context)
        except ParseError as msg:
            raise ParseError("Invalid line (%s):\n%r" % (msg, raw_line)) from msg
    return self.sink
def parsepatch(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
    """Parse the patch line held in ``self.line`` and apply it.

    Triple/quad and prefix operations are dispatched to their handlers; every
    other operation code is recognised and then ignored.

    :param bnode_context: forwarded to `add_or_remove_triple_or_quad`
    """
    self.eat(r_wspace)
    # From spec: "No comments should be included (comments start # and run to end
    # of line)."
    line = self.line
    if not line or line.startswith("#"):
        return  # The line is empty or a comment

    # if header, transaction, skip
    operation = self.operation()
    self.eat(r_wspace)

    if operation == Operation.AddPrefix:
        self.add_prefix()
    elif operation == Operation.DeletePrefix:
        self.delete_prefix()
    elif operation in (Operation.AddTripleOrQuad, Operation.DeleteTripleOrQuad):
        self.add_or_remove_triple_or_quad(operation, bnode_context)
def add_or_remove_triple_or_quad(
    self, operation, bnode_context: Optional[_BNodeContextType] = None
) -> None:
    """Parse a triple or quad from the line and add it to or remove it from the sink.

    :param operation: ``Operation.AddTripleOrQuad`` or
        ``Operation.DeleteTripleOrQuad``; any other value parses the line
        but changes nothing
    :param bnode_context: forwarded to the term parsers
    :raises ParseError: if text remains after the statement tail
    """
    self.eat(r_wspace)
    if not self.line or self.line.startswith("#"):
        return  # The line is empty or a comment

    subject = self.labeled_bnode() or self.subject(bnode_context)
    self.eat(r_wspace)
    predicate = self.predicate()
    self.eat(r_wspace)
    obj = self.labeled_bnode() or self.object(bnode_context)
    self.eat(r_wspace)
    context = self.labeled_bnode() or self.uriref() or self.nodeid(bnode_context)
    self.eat(r_tail)
    if self.line:
        raise ParseError("Trailing garbage")

    adding = operation == Operation.AddTripleOrQuad
    if not adding and operation != Operation.DeleteTripleOrQuad:
        return

    # Must have a context aware store - add on a normal Graph
    # discards anything where the ctx != graph.identifier
    if context:
        target = self.sink.get_context(context)
    else:
        target = self.sink.default_context

    statement = (subject, predicate, obj)
    if adding:
        target.add(statement)
    else:
        target.remove(statement)
def add_prefix(self):
    """Bind the prefix named on the current line to its namespace in the sink.

    Quote characters are removed and the rest is split on single spaces into
    exactly three fields: prefix, namespace and a trailing field that is
    ignored. Any other field count raises ``ValueError`` from the unpacking.
    """
    # Extract prefix and URI from the line
    unquoted = self.line.replace('"', "").replace("'", "")  # type: ignore[union-attr]
    # Split on single spaces: an empty prefix then shows up as an empty field.
    prefix, ns, _ = unquoted.split(" ")
    self.sink.bind(prefix, ns.strip("<>"))
def delete_prefix(self):
    """Rebind the prefix named on the current line to ``None``.

    The line is split like in `add_prefix` (quotes removed, exactly three
    space-separated fields); only the first field is used.
    """
    unquoted = self.line.replace('"', "").replace("'", "")  # type: ignore[union-attr]
    fields = unquoted.split(" ")
    prefix, _, _ = fields
    self.sink.namespace_manager.bind(prefix, None, replace=True)
def operation(self) -> Operation:
    """Identify and consume the operation code at the start of the line.

    Members are tried in definition order; the first whose value prefixes the
    line wins and is removed from the line through `eat_op`.

    :raises ValueError: if no operation code matches
    """
    line = self.line
    matched = next(
        (op for op in Operation if line.startswith(op.value)),  # type: ignore[union-attr]
        None,
    )
    if matched is None:
        raise ValueError(
            f'Invalid or no Operation found in line: "{self.line}". Valid Operations '
            f"codes are {', '.join([op.value for op in Operation])}"
        )
    self.eat_op(matched.value)
    return matched
def eat_op(self, op: str) -> None:
    """Remove the operation code ``op`` from the start of the line.

    :param op: the operation code; the line is left untouched if it does not
        start with it
    """
    line = self.line
    if line.startswith(op):  # type: ignore[union-attr]
        # Slice by length instead of str.lstrip(op): lstrip treats ``op`` as a
        # set of characters and would also strip following characters that
        # happen to be in that set.
        line = line[len(op) :]  # type: ignore[index]
    self.line = line
def nodeid(
    self, bnode_context: Optional[_BNodeContextType] = None
) -> Union[te.Literal[False], BNode, URIRef]:
    """Consume a node id if the line starts with ``_``.

    The captured label is used directly as the blank node identifier;
    ``bnode_context`` is accepted but not consulted here.

    :return: the ``BNode``, or ``False`` when the line does not start with ``_``
    """
    if not self.peek("_"):
        return False
    label = self.eat(r_nodeid).group(1)
    return BNode(label)
def labeled_bnode(self):
    """Consume a blank node written in URI brackets, e.g. ``<_:label>``.

    :return: the ``BNode`` for the label, or ``False`` when the line does not
        start with ``<_``
    :raises ParseError: if the bracketed text is not matched by ``r_nodeid``
    """
    if not self.peek("<_"):
        return False
    plain_uri = self.eat(r_uriref).group(1)
    found = r_nodeid.match(plain_uri)
    if found is None:
        # Report malformed input as a parse error rather than failing with
        # AttributeError on a missing match.
        raise ParseError("Invalid labelled blank node: <%s>" % plain_uri)
    return BNode(found.group(1))
@@ -0,0 +1,651 @@
"""
An RDF/XML parser for RDFLib
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, List, NoReturn, Optional, Tuple
from urllib.parse import urldefrag, urljoin
from xml.sax import handler, make_parser, xmlreader
from xml.sax.handler import ErrorHandler
from xml.sax.saxutils import escape, quoteattr
from rdflib.exceptions import Error, ParserError
from rdflib.graph import Graph
from rdflib.namespace import RDF, is_ncname
from rdflib.parser import InputSource, Parser
from rdflib.plugins.parsers.RDFVOC import RDFVOC
from rdflib.term import BNode, Identifier, Literal, URIRef
if TYPE_CHECKING:
# from xml.sax.expatreader import ExpatLocator
from xml.sax.xmlreader import AttributesImpl, Locator
from rdflib.graph import _ObjectType, _SubjectType, _TripleType
__all__ = ["create_parser", "BagID", "ElementHandler", "RDFXMLHandler", "RDFXMLParser"]
# Alias used by the parser code below, which indexes it as ``RDFNS[...]`` and
# compares attribute URIs against ``str(RDFNS)``.
RDFNS = RDFVOC

# http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
# A mapping from unqualified terms to their qualified version.
UNQUALIFIED = {
    "about": RDFVOC.about,
    "ID": RDFVOC.ID,
    "type": RDFVOC.type,
    "resource": RDFVOC.resource,
    "parseType": RDFVOC.parseType,
}

# http://www.w3.org/TR/rdf-syntax-grammar/#coreSyntaxTerms
CORE_SYNTAX_TERMS = [
    RDFVOC.RDF,
    RDFVOC.ID,
    RDFVOC.about,
    RDFVOC.parseType,
    RDFVOC.resource,
    RDFVOC.nodeID,
    RDFVOC.datatype,
]

# http://www.w3.org/TR/rdf-syntax-grammar/#syntaxTerms
SYNTAX_TERMS = CORE_SYNTAX_TERMS + [RDFVOC.Description, RDFVOC.li]

# http://www.w3.org/TR/rdf-syntax-grammar/#oldTerms
OLD_TERMS = [
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"),
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"),
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"),
]

# URIs that node_element_start rejects as node element names.
NODE_ELEMENT_EXCEPTIONS = (
    CORE_SYNTAX_TERMS
    + [
        RDFVOC.li,
    ]
    + OLD_TERMS
)
# Attributes that node_element_start skips when turning attributes into triples.
NODE_ELEMENT_ATTRIBUTES = [RDFVOC.ID, RDFVOC.nodeID, RDFVOC.about]

# URIs that property_element_start rejects as property element names.
PROPERTY_ELEMENT_EXCEPTIONS = (
    CORE_SYNTAX_TERMS
    + [
        RDFVOC.Description,
    ]
    + OLD_TERMS
)
# URIs rejected as property attribute names.
PROPERTY_ATTRIBUTE_EXCEPTIONS = (
    CORE_SYNTAX_TERMS + [RDFVOC.Description, RDFVOC.li] + OLD_TERMS
)
# Attributes that property_element_start skips when turning attributes into triples.
PROPERTY_ELEMENT_ATTRIBUTES = [RDFVOC.ID, RDFVOC.resource, RDFVOC.nodeID]

XMLNS = "http://www.w3.org/XML/1998/namespace"
# (namespace, local name) keys used to look up xml:base and xml:lang attributes.
BASE = (XMLNS, "base")
LANG = (XMLNS, "lang")
class BagID(URIRef):
    """URI reference that also carries a running ``rdf:_n`` member counter."""

    __slots__ = ["li"]

    def __init__(self, val):
        """Start the member counter at zero.

        :param val: the URI text (not used here; the string value is set when
            the instance is created, before ``__init__`` runs)
        """
        # The initialiser chain ends in object.__init__, which rejects extra
        # arguments, so ``val`` must not be forwarded.
        super().__init__()
        self.li = 0

    def next_li(self):
        """Advance the counter and return the ``RDFNS`` term ``_<n>`` for it."""
        self.li += 1
        # type error: Type expected within [...]
        return RDFNS["_%s" % self.li]  # type: ignore[misc]
class ElementHandler:
    """Per-element parse state that ``RDFXMLHandler`` keeps on its stack.

    ``start``, ``char`` and ``end`` hold the callbacks used for the element;
    the remaining slots carry values collected while it is processed.
    """

    __slots__ = [
        "start",
        "char",
        "end",
        "li",
        "id",
        "base",
        "subject",
        "predicate",
        "object",
        "list",
        "language",
        "datatype",
        "declared",
        "data",
    ]

    def __init__(self):
        """Initialise every slot: ``li`` to 0 and all others to ``None``."""
        self.start = None
        self.char = None
        self.end = None
        self.li = 0
        self.id = None
        self.base = None
        self.subject = None
        # Initialised like every other slot so that reading it before a
        # property element has set it yields None instead of AttributeError.
        self.predicate = None
        self.object = None
        self.list = None
        self.language = None
        self.datatype = None
        self.declared = None
        self.data = None

    def next_li(self):
        """Advance the ``rdf:li`` counter and return the ``RDFVOC`` term ``_<n>``."""
        self.li += 1
        return RDFVOC["_%s" % self.li]
class RDFXMLHandler(handler.ContentHandler):
def __init__(self, store: Graph):
    """Remember the target graph and put the handler into its initial state.

    :param store: graph that receives triples and namespace bindings
    """
    self.preserve_bnode_ids = False
    self.store = store
    self.reset()
def reset(self) -> None:
    """Reinitialise the element stack, the ID/blank-node tables and the namespace contexts."""
    root = ElementHandler()
    root.start = self.document_element_start
    root.end = lambda name, qname: None
    self.stack: List[Optional[ElementHandler]] = [None, root]

    self.ids: Dict[str, int] = {}  # remember IDs we have already seen
    self.bnode: Dict[str, Identifier] = {}

    # contains uri -> prefix dicts; the current context starts out as the
    # very dict stored at the top of the context list
    initial: Dict[str, Optional[str]] = {}
    self._ns_contexts: List[Dict[str, Optional[str]]] = [initial]
    self._current_context: Dict[str, Optional[str]] = initial
# ContentHandler methods
def setDocumentLocator(self, locator: Locator):
    """Store ``locator``; `error` reads it for system id, line and column, and
    `startElementNS` reads it for the public/system id when resolving the base URI."""
    self.locator = locator
def startDocument(self) -> None:
    """SAX callback for the start of the document; nothing is done here."""
    pass
def startPrefixMapping(self, prefix: Optional[str], namespace: str) -> None:
    """Record a new prefix mapping and bind it on the store.

    A copy of the current uri -> prefix context is pushed before the current
    context is updated; the binding is made with ``override=False``.
    """
    snapshot = self._current_context.copy()
    self._ns_contexts.append(snapshot)
    self._current_context[namespace] = prefix
    self.store.bind(prefix, namespace or "", override=False)
def endPrefixMapping(self, prefix: Optional[str]) -> None:
    """Make the most recently pushed namespace context current and remove it from the list."""
    self._current_context = self._ns_contexts.pop()
def startElementNS(
    self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
) -> None:
    """SAX callback for an element start.

    Pushes a fresh ``ElementHandler``, determines the base URI and language
    in effect for the element, stores both on the current handler and then
    calls that handler's ``start`` callback.
    """
    stack = self.stack
    stack.append(ElementHandler())
    # After the push, ``current`` (stack[-2]) is the handler that was on top
    # before, and ``parent`` (stack[-3]) is the one below it.
    current = self.current
    parent = self.parent
    # type error: No overload for "get" of "AttributesImpl" matches tuple (str, str)
    base = attrs.get(BASE, None)  # type: ignore[call-overload, unused-ignore]
    if base is not None:
        # Explicit xml:base: drop its fragment, then resolve it against the
        # parent's base or, failing that, the document's public/system id.
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        # No xml:base: inherit from the parent, else fall back to the
        # document's public/system id without its fragment.
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    # type error: No overload for "get" of "AttributesImpl" matches tuple (str, str)
    language = attrs.get(LANG, None)  # type: ignore[call-overload, unused-ignore]
    if language is None:
        # xml:lang is inherited from the parent when not given.
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
def endElementNS(self, name: Tuple[Optional[str], str], qname) -> None:
    """SAX callback for an element end: run the current handler's ``end`` callback, then pop the stack."""
    finished = self.current
    finished.end(name, qname)
    self.stack.pop()
def characters(self, content: str) -> None:
    """SAX callback for character data: pass it to the current handler's ``char`` callback, if one is set."""
    callback = self.current.char
    if not callback:
        return
    callback(content)
def ignorableWhitespace(self, content) -> None:
    """SAX callback for ignorable whitespace; the content is discarded."""
    pass
def processingInstruction(self, target, data) -> None:
    """SAX callback for processing instructions; they are ignored."""
    pass
def add_reified(self, sid: Identifier, spo: _TripleType):
    """Add the four reification triples describing ``spo`` with ``sid`` as the statement node.

    :param sid: identifier of the statement resource
    :param spo: the (subject, predicate, object) triple being described
    """
    s, p, o = spo
    add = self.store.add
    add((sid, RDF.type, RDF.Statement))
    for prop, value in ((RDF.subject, s), (RDF.predicate, p), (RDF.object, o)):
        add((sid, prop, value))
def error(self, message: str) -> NoReturn:
    """Raise ``ParserError`` with ``message`` prefixed by ``systemId:line:column: ``.

    :raises ParserError: always
    """
    loc = self.locator
    position = (loc.getSystemId(), loc.getLineNumber(), loc.getColumnNumber())
    where = "%s:%s:%s: " % position
    raise ParserError(where + message)
def get_current(self) -> Optional[ElementHandler]:
    """Return the handler of the element being processed (second from the top of the stack)."""
    frames = self.stack
    return frames[-2]

# Read-only property: ``self.current`` gives the current element handler.
current = property(get_current)
def get_next(self) -> Optional[ElementHandler]:
    """Return the handler at the top of the stack, which the next element will use."""
    frames = self.stack
    return frames[-1]

# Read-only property: ``self.next`` gives the element handler to be used for
# the next element.
next = property(get_next)
def get_parent(self) -> Optional[ElementHandler]:
    """Return the handler of the current element's parent (third from the top of the stack)."""
    frames = self.stack
    return frames[-3]

# Read-only property: ``self.parent`` gives the current parent element handler.
parent = property(get_parent)
def absolutize(self, uri: str) -> URIRef:
    """Resolve ``uri`` against the current element's base and return it as a ``URIRef``.

    A trailing ``#`` on ``uri`` is restored when joining dropped it.
    """
    # type error: Argument "allow_fragments" to "urljoin" has incompatible type "int"; expected "bool"
    joined = urljoin(self.current.base, uri, allow_fragments=1)  # type: ignore[arg-type]
    lost_hash = bool(uri) and uri[-1] == "#" and joined[-1] != "#"
    if lost_hash:
        joined = "%s#" % joined
    return URIRef(joined)
def convert(
    self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
) -> Tuple[URIRef, Dict[URIRef, str]]:
    """Turn a SAX element name and its attributes into URI-keyed form.

    :return: the element name as ``URIRef`` (namespace and local part joined,
        or the local part alone when there is no namespace) and a dict of
        attribute URI -> value. Attributes whose joined name starts with the
        XML namespace or with "xml" (any case) are dropped; names listed in
        ``UNQUALIFIED`` are mapped through ``RDFNS``.
    """
    if name[0] is None:
        element = URIRef(name[1])
    else:
        # type error: Argument 1 to "join" of "str" has incompatible type "Tuple[Optional[str], str]"; expected "Iterable[str]"
        element = URIRef("".join(name))  # type: ignore[arg-type]

    atts = {}
    for key, value in attrs.items():
        # mypy error: mypy thinks key[0]==None is unreachable
        att = key[1] if key[0] is None else "".join(key)
        if att.startswith(XMLNS) or att[0:3].lower() == "xml":
            continue
        if att in UNQUALIFIED:
            # type error: Variable "att" is not valid as a type
            atts[RDFNS[att]] = value  # type: ignore[misc, valid-type]
        else:
            atts[URIRef(att)] = value
    return element, atts
def document_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Handle the document element.

    For an ``rdf:RDF`` root the next handler is set up for node elements;
    any other root is itself processed as a node element.
    """
    is_rdf_root = bool(name[0]) and URIRef("".join(name)) == RDFVOC.RDF
    if not is_rdf_root:
        self.node_element_start(name, qname, attrs)
        # self.current.end = self.node_element_end
        # TODO... set end to something that sets start such that
        # another element will cause error
        return

    upcoming = self.next
    upcoming.start = self.node_element_start
    upcoming.end = self.node_element_end
def node_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Process the start of a node element.

    Determines the subject from ``rdf:ID``, ``rdf:nodeID`` or ``rdf:about``
    (a new blank node otherwise), adds an ``rdf:type`` triple for typed node
    elements, turns the remaining attributes into triples on the subject and
    stores the subject on the current handler. Child elements are set up to
    be handled as property elements.
    """
    # type error: Incompatible types in assignment (expression has type "URIRef", variable has type "Tuple[str, str]")
    name, atts = self.convert(name, qname, attrs)  # type: ignore[assignment]
    current = self.current
    absolutize = self.absolutize

    # Children of a node element are property elements.
    next = self.next
    next.start = self.property_element_start
    next.end = self.property_element_end

    if name in NODE_ELEMENT_EXCEPTIONS:
        # type error: Not all arguments converted during string formatting
        self.error("Invalid node element URI: %s" % name)  # type: ignore[str-format]
    subject: _SubjectType
    if RDFVOC.ID in atts:
        if RDFVOC.about in atts or RDFVOC.nodeID in atts:
            self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID")

        id = atts[RDFVOC.ID]
        if not is_ncname(id):
            self.error("rdf:ID value is not a valid NCName: %s" % id)
        subject = absolutize("#%s" % id)
        if subject in self.ids:
            self.error("two elements cannot use the same ID: '%s'" % subject)
        self.ids[subject] = 1  # IDs can only appear once within a document

    elif RDFVOC.nodeID in atts:
        if RDFVOC.ID in atts or RDFVOC.about in atts:
            self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID")
        nodeID = atts[RDFVOC.nodeID]
        if not is_ncname(nodeID):
            self.error("rdf:nodeID value is not a valid NCName: %s" % nodeID)
        if self.preserve_bnode_ids is False:
            # Map each document label to one generated blank node.
            if nodeID in self.bnode:
                subject = self.bnode[nodeID]
            else:
                subject = BNode()
                self.bnode[nodeID] = subject
        else:
            # Keep the label from the document as the blank node id.
            subject = BNode(nodeID)

    elif RDFVOC.about in atts:
        if RDFVOC.ID in atts or RDFVOC.nodeID in atts:
            self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID")
        subject = absolutize(atts[RDFVOC.about])

    else:
        subject = BNode()

    if name != RDFVOC.Description:  # S1
        # error: Argument 1 has incompatible type "Tuple[str, str]"; expected "str"
        self.store.add((subject, RDF.type, absolutize(name)))  # type: ignore[arg-type]

    object: _ObjectType
    language = current.language
    for att in atts:
        if not att.startswith(str(RDFNS)):
            # Attribute outside the RDF vocabulary: a literal-valued property.
            predicate = absolutize(att)
            try:
                object = Literal(atts[att], language)
            except Error as e:
                # type error: Argument 1 to "error" of "RDFXMLHandler" has incompatible type "Optional[str]"; expected "str"
                self.error(e.msg)  # type: ignore[arg-type]
        elif att == RDF.type:  # S2
            predicate = RDF.type
            object = absolutize(atts[RDF.type])
        elif att in NODE_ELEMENT_ATTRIBUTES:
            # Already consumed above when the subject was determined.
            continue
        elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS:  # S3
            self.error("Invalid property attribute URI: %s" % att)
            # type error: Statement is unreachable
            continue  # type: ignore[unreachable]  # for when error does not throw an exception
        else:
            predicate = absolutize(att)
            try:
                object = Literal(atts[att], language)
            except Error as e:
                # type error: Argument 1 to "error" of "RDFXMLHandler" has incompatible type "Optional[str]"; expected "str"
                self.error(e.msg)  # type: ignore[arg-type]
        self.store.add((subject, predicate, object))

    current.subject = subject
def node_element_end(self, name: Tuple[str, str], qname) -> None:
    """Finish a node element and hand its subject to the parent as the parent's object.

    Reports an error when the parent already has an object, unless the
    current handler is the one at stack position 2.
    """
    parent = self.parent
    current = self.current
    # repeat node-elements are only allowed
    # at top-level
    if parent.object and current != self.stack[2]:
        self.error(
            "Repeat node-elements inside property elements: %s" % "".join(name)
        )
    parent.object = current.subject
def property_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Process the start of a property element.

    Sets the predicate on the current handler (numbering ``rdf:li``),
    records an ``rdf:ID`` for reification, and then decides, from
    ``rdf:resource``, ``rdf:nodeID`` and ``rdf:parseType``, what the object
    is and which callbacks handle the element's content. Remaining
    attributes become triples on the object node.

    :raises ParserError: through `error`, for invalid element or attribute
        URIs, invalid NCNames and conflicting attributes
    """
    # type error: Incompatible types in assignment (expression has type "URIRef", variable has type "Tuple[str, str]")
    name, atts = self.convert(name, qname, attrs)  # type: ignore[assignment]
    current = self.current
    absolutize = self.absolutize

    next = self.next
    object: Optional[_ObjectType] = None
    current.data = None
    current.list = None

    # type error: "Tuple[str, str]" has no attribute "startswith"
    if not name.startswith(str(RDFNS)):  # type: ignore[attr-defined]
        # type error: Argument 1 has incompatible type "Tuple[str, str]"; expected "str"
        current.predicate = absolutize(name)  # type: ignore[arg-type]
    elif name == RDFVOC.li:
        current.predicate = current.next_li()
    elif name in PROPERTY_ELEMENT_EXCEPTIONS:
        # type error: Not all arguments converted during string formatting
        self.error("Invalid property element URI: %s" % name)  # type: ignore[str-format]
    else:
        # type error: Argument 1 has incompatible type "Tuple[str, str]"; expected "str"
        current.predicate = absolutize(name)  # type: ignore[arg-type]

    id = atts.get(RDFVOC.ID, None)
    if id is not None:
        if not is_ncname(id):
            self.error("rdf:ID value is not a valid NCName: %s" % id)
        current.id = absolutize("#%s" % id)
    else:
        current.id = None

    resource = atts.get(RDFVOC.resource, None)
    nodeID = atts.get(RDFVOC.nodeID, None)
    parse_type = atts.get(RDFVOC.parseType, None)
    if resource is not None and nodeID is not None:
        self.error("Property element cannot have both rdf:nodeID and rdf:resource")
    if resource is not None:
        object = absolutize(resource)
        next.start = self.node_element_start
        next.end = self.node_element_end
    elif nodeID is not None:
        if not is_ncname(nodeID):
            self.error("rdf:nodeID value is not a valid NCName: %s" % nodeID)
        if self.preserve_bnode_ids is False:
            # Map each document label to one generated blank node.
            if nodeID in self.bnode:
                object = self.bnode[nodeID]
            else:
                subject = BNode()
                self.bnode[nodeID] = subject
                object = subject
        else:
            object = subject = BNode(nodeID)
        next.start = self.node_element_start
        next.end = self.node_element_end
    else:
        if parse_type is not None:
            # With rdf:parseType only rdf:ID may accompany it.
            for att in atts:
                if att != RDFVOC.parseType and att != RDFVOC.ID:
                    self.error("Property attr '%s' not allowed here" % att)
            if parse_type == "Resource":
                current.subject = object = BNode()
                current.char = self.property_element_char
                next.start = self.property_element_start
                next.end = self.property_element_end
            elif parse_type == "Collection":
                current.char = None
                object = current.list = RDF.nil  # BNode()
                # self.parent.subject
                next.start = self.node_element_start
                next.end = self.list_node_element_end
            else:  # if parse_type=="Literal":
                # All other values are treated as Literal
                # See: http://www.w3.org/TR/rdf-syntax-grammar/
                # parseTypeOtherPropertyElt
                object = Literal("", datatype=RDFVOC.XMLLiteral)
                current.char = self.literal_element_char
                current.declared = {XMLNS: "xml"}
                next.start = self.literal_element_start
                next.char = self.literal_element_char
                next.end = self.literal_element_end
            current.object = object
            return
        else:
            object = None
            current.char = self.property_element_char
            next.start = self.node_element_start
            next.end = self.node_element_end

    datatype = current.datatype = atts.get(RDFVOC.datatype, None)
    language = current.language
    if datatype is not None:
        # TODO: check that there are no atts other than datatype and id
        datatype = absolutize(datatype)
    else:
        for att in atts:
            if not att.startswith(str(RDFNS)):
                predicate = absolutize(att)
            elif att in PROPERTY_ELEMENT_ATTRIBUTES:
                continue
            elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS:
                self.error("""Invalid property attribute URI: %s""" % att)
            else:
                predicate = absolutize(att)

            o: _ObjectType
            if att == RDF.type:
                o = URIRef(atts[att])
            else:
                if datatype is not None:
                    # type error: Statement is unreachable
                    language = None  # type: ignore[unreachable]
                o = Literal(atts[att], language, datatype)

            # Property attributes describe the object node; create a blank
            # node for it when none has been determined yet.
            if object is None:
                object = BNode()
            self.store.add((object, predicate, o))
    if object is None:
        # No object yet: collect character data for a literal.
        current.data = ""
        current.object = None
    else:
        current.data = None
        current.object = object
def property_element_char(self, data: str) -> None:
    """Append character data to the current handler's text buffer, if it is collecting any."""
    target = self.current
    if target.data is None:
        return
    target.data += data
def property_element_end(self, name: Tuple[str, str], qname) -> None:
    """Finish a property element and emit its triple.

    Collected text becomes a literal object when no object was set (the
    language is dropped when a datatype is present). A collection whose
    object is not ``rdf:nil`` is terminated with ``rdf:rest rdf:nil``. The
    statement is reified when the element carried an ``rdf:ID``.
    """
    current = self.current
    if current.data is not None and current.object is None:
        lang = None if current.datatype is not None else current.language
        current.object = Literal(current.data, lang, current.datatype)
        current.data = None

    closing_list = self.next.end == self.list_node_element_end
    if closing_list and current.object != RDF.nil:
        self.store.add((current.list, RDF.rest, RDF.nil))

    if current.object is not None:
        statement = (self.parent.subject, current.predicate, current.object)
        self.store.add(statement)
        if current.id is not None:
            self.add_reified(current.id, statement)
    current.subject = None
def list_node_element_end(self, name: Tuple[str, str], qname) -> None:
    """Append the finished node element to the parent's collection.

    A new blank node becomes the list cell holding the element's subject as
    ``rdf:first``. The first cell also becomes the parent's object; later
    cells are linked from the previous one with ``rdf:rest``.
    """
    current = self.current
    parent = self.parent
    cell = BNode()
    # Removed between 20030123 and 20030905
    # self.store.add((cell, RDF.type, LIST))
    if parent.list == RDF.nil:
        # First member of the collection.
        parent.list = cell
        self.store.add((cell, RDF.first, current.subject))
        parent.object = cell
        parent.char = None
    else:
        self.store.add((parent.list, RDF.rest, cell))
        self.store.add((cell, RDF.first, current.subject))
        parent.list = cell
def literal_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Serialise the start tag of an element nested inside an XML literal.

    The tag text is stored on the current handler's ``object``. A namespace
    declaration is emitted for the element's namespace when it is not in the
    set inherited from the parent; attribute namespaces are only recorded.
    Nested elements are routed to the literal callbacks as well.
    """
    current = self.current
    upcoming = self.next
    upcoming.start = self.literal_element_start
    upcoming.char = self.literal_element_char
    upcoming.end = self.literal_element_end
    declared = current.declared = self.parent.declared.copy()

    namespace, local = name[0], name[1]
    if namespace:
        prefix = self._current_context[namespace]
        tag = "<%s:%s" % (prefix, local) if prefix else "<%s" % local
        if namespace not in declared:
            declared[namespace] = prefix
            if prefix:
                tag += ' xmlns:%s="%s"' % (prefix, namespace)
            else:
                tag += ' xmlns="%s"' % namespace
    else:
        tag = "<%s" % local
    current.object = tag

    for att_name, value in attrs.items():
        att_ns = att_name[0]
        if att_ns:
            if att_ns not in declared:
                declared[att_ns] = self._current_context[att_ns]
            qualified = declared[att_ns] + ":" + att_name[1]
        else:
            qualified = att_name[1]
        current.object += " %s=%s" % (qualified, quoteattr(value))
    current.object += ">"
def literal_element_char(self, data: str) -> None:
    """Append XML-escaped character data to the literal text on the current handler."""
    target = self.current
    target.object += escape(data)
def literal_element_end(self, name: Tuple[str, str], qname) -> None:
    """Close an element nested inside an XML literal.

    The element's serialised text plus its end tag is appended to the
    parent's ``object``; the end tag carries the prefix mapped to the
    element's namespace, when there is one.
    """
    namespace, local = name[0], name[1]
    prefix = self._current_context[namespace] if namespace else None
    if prefix:
        end = "</%s:%s>" % (prefix, local)
    else:
        end = "</%s>" % local
    self.parent.object += self.current.object + end
def create_parser(target: InputSource, store: Graph) -> xmlreader.XMLReader:
    """Build a namespace-aware SAX parser whose content handler writes to ``store``.

    :param target: the input source; it is also handed to the handler as its
        document locator
    :param store: graph that receives the parsed triples
    :return: the configured SAX reader
    """
    rdfxml = RDFXMLHandler(store)
    # type error: Argument 1 to "setDocumentLocator" of "RDFXMLHandler" has incompatible type "InputSource"; expected "Locator"
    rdfxml.setDocumentLocator(target)  # type: ignore[arg-type]
    # rdfxml.setDocumentLocator(_Locator(self.url, self.parser))

    parser = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        parser.start_namespace_decl("xml", "http://www.w3.org/XML/1998/namespace")  # type: ignore[attr-defined]
    except AttributeError:
        pass  # Not present in Jython (at least)
    parser.setFeature(handler.feature_namespaces, 1)
    parser.setContentHandler(rdfxml)
    parser.setErrorHandler(ErrorHandler())
    return parser
class RDFXMLParser(Parser):
    """RDFLib parser plugin for RDF/XML."""

    def __init__(self):
        pass

    def parse(self, source: InputSource, sink: Graph, **args: Any) -> None:
        """Parse ``source`` as RDF/XML into ``sink``.

        :param source: the RDF/XML input
        :param sink: graph that receives the triples
        :param args: ``preserve_bnode_ids``, when given and not ``None``, is
            set on the content handler before parsing
        """
        self._parser = create_parser(source, sink)
        content_handler = self._parser.getContentHandler()
        keep_ids = args.get("preserve_bnode_ids", None)
        if keep_ids is not None:
            # type error: ContentHandler has no attribute "preserve_bnode_ids"
            content_handler.preserve_bnode_ids = keep_ids  # type: ignore[attr-defined, unused-ignore]
        # The parser is used only once, so no reset is needed before parsing.
        self._parser.parse(source)
@@ -0,0 +1,177 @@
from __future__ import annotations
from typing import Any, MutableSequence
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.parser import InputSource, Parser
from .notation3 import RDFSink, SinkParser
def becauseSubGraph(*args, **kwargs):  # noqa: N802
    """Accept any arguments and do nothing."""
    return None
class TrigSinkParser(SinkParser):
def directiveOrStatement(self, argstr: str, h: int) -> int:  # noqa: N802
    """Parse one graph block, directive or statement starting at offset ``h``.

    The alternatives are tried in order: graph, SPARQL-style directive,
    directive, statement. The result of the first two is returned as-is; the
    result of the last two is passed through ``checkDot``.

    :return: the offset after the parsed construct, or a negative value at
        end of input or when nothing matched
    """
    start = self.skipSpace(argstr, h)
    if start < 0:
        return start  # EOF

    for attempt in (self.graph, self.sparqlDirective):
        end = attempt(argstr, start)
        if end >= 0:
            return end

    for attempt in (self.directive, self.statement):
        end = attempt(argstr, start)
        if end >= 0:
            return self.checkDot(argstr, end)

    return end
def labelOrSubject(  # noqa: N802
    self, argstr: str, i: int, res: MutableSequence[Any]
) -> int:
    """Parse a graph label: a URI reference or an empty ``[ ]`` blank node.

    :param res: receives the parsed term
    :return: the offset after the label, or a negative value at end of input
        or when no label starts here
    """
    start = self.skipSpace(argstr, i)
    if start < 0:
        return start  # eof

    end = self.uri_ref2(argstr, start, res)
    if end >= 0:
        return end

    if argstr[start] != "[":
        return -1

    closing = self.skipSpace(argstr, start + 1)
    if closing < 0:
        self.BadSyntax(argstr, start, "Expected ] got EOF")
    if argstr[closing] == "]":
        res.append(self.blankNode())
        return closing + 1
    return -1
def graph(self, argstr: str, i: int) -> int:
"""
Parse trig graph, i.e.
<urn:graphname> = { .. triples .. }
return -1 if it doesn't look like a graph-decl
raise Exception if it looks like a graph, but isn't.
"""
need_graphid = False
# import pdb; pdb.set_trace()
j = self.sparqlTok("GRAPH", argstr, i) # optional GRAPH keyword
if j >= 0:
i = j
need_graphid = True
r: MutableSequence[Any] = []
j = self.labelOrSubject(argstr, i, r)
if j >= 0:
graph = r[0]
i = j
elif need_graphid:
self.BadSyntax(argstr, i, "GRAPH keyword must be followed by graph name")
else:
graph = self._store.graph.identifier # hack
j = self.skipSpace(argstr, i)
if j < 0:
self.BadSyntax(argstr, i, "EOF found when expected graph")
if argstr[j : j + 1] == "=": # optional = for legacy support
i = self.skipSpace(argstr, j + 1)
if i < 0:
self.BadSyntax(argstr, i, "EOF found when expecting '{'")
else:
i = j
if argstr[i : i + 1] != "{":
return -1 # the node wasn't part of a graph
j = i + 1
if self._context is not None:
self.BadSyntax(argstr, i, "Nested graphs are not allowed")
oldParentContext = self._parentContext # noqa: N806
self._parentContext = self._context
reason2 = self._reason2
self._reason2 = becauseSubGraph
# type error: Incompatible types in assignment (expression has type "Graph", variable has type "Optional[Formula]")
self._context = self._store.newGraph(graph) # type: ignore[assignment]
while 1:
i = self.skipSpace(argstr, j)
if i < 0:
self.BadSyntax(argstr, i, "needed '}', found end.")
if argstr[i : i + 1] == "}":
j = i + 1
break
j = self.directiveOrStatement(argstr, i)
if j < 0:
self.BadSyntax(argstr, i, "expected statement or '}'")
self._context = self._parentContext
self._reason2 = reason2
self._parentContext = oldParentContext
# res.append(subj.close()) # No use until closed
return j
class TrigParser(Parser):
    """
    An RDFLib parser for TriG
    """

    def __init__(self):
        pass

    def parse(self, source: InputSource, graph: Graph, encoding: str = "utf-8") -> None:
        """Parse TriG from ``source`` into the store backing ``graph``.

        :param source: input source; its character stream is preferred and
            the byte stream is used as a fallback.
        :param graph: target graph. Its store must be context aware; the
            graph itself becomes the default context of a
            ``ConjunctiveGraph`` wrapped around that store.
        :param encoding: must be ``"utf-8"`` (or ``None``).
        :raises Exception: if any other encoding is requested.
        :raises AssertionError: if the store is not context aware.
        """
        if encoding not in [None, "utf-8"]:
            # The message used to be built as ``(tuple) % encoding``, which
            # raised a TypeError instead of this error.
            raise Exception(
                f"TriG files are always utf-8 encoded, I was passed: {encoding}"
            )

        # we're currently being handed a Graph, not a ConjunctiveGraph
        assert graph.store.context_aware, "TriG Parser needs a context-aware store!"

        conj_graph = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
        # TODO: CG __init__ should have a default_context arg
        conj_graph.default_context = graph
        # TODO: update N3Processor so that it can use conj_graph as the sink
        conj_graph.namespace_manager = graph.namespace_manager

        sink = RDFSink(conj_graph)

        baseURI = conj_graph.absolutize(  # noqa: N806
            source.getPublicId() or source.getSystemId() or ""
        )
        p = TrigSinkParser(sink, baseURI=baseURI, turtle=True)

        stream = source.getCharacterStream()  # try to get str stream first
        if not stream:
            # fallback to get the bytes stream
            stream = source.getByteStream()
        p.loadStream(stream)

        # Copy the prefixes collected by the parser onto the graph.
        for prefix, namespace in p._bindings.items():
            conj_graph.bind(prefix, namespace)
@@ -0,0 +1,296 @@
"""
A TriX parser for RDFLib
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, List, NoReturn, Optional, Tuple
from xml.sax import handler, make_parser
from xml.sax.handler import ErrorHandler
from rdflib.exceptions import ParserError
from rdflib.graph import Graph
from rdflib.namespace import Namespace
from rdflib.parser import InputSource, Parser
from rdflib.store import Store
from rdflib.term import BNode, Identifier, Literal, URIRef
if TYPE_CHECKING:
# from xml.sax.expatreader import ExpatLocator
from xml.sax.xmlreader import AttributesImpl, Locator, XMLReader
__all__ = ["create_parser", "TriXHandler", "TriXParser"]
TRIXNS = Namespace("http://www.w3.org/2004/03/trix/trix-1/")
XMLNS = Namespace("http://www.w3.org/XML/1998/namespace")
class TriXHandler(handler.ContentHandler):
    """A SAX handler for TriX. See http://sw.nokia.com/trix/

    The handler is a small state machine driven by ``self.state``:

    * ``0`` - outside the ``trix`` root element
    * ``1`` - inside ``trix``, outside any ``graph``
    * ``2`` - inside ``graph``
    * ``3`` - reading the graph name (``uri`` / ``id`` directly in ``graph``)
    * ``4`` - inside ``triple``
    """

    lang: Optional[str]
    datatype: Optional[str]

    def __init__(self, store: Store):
        self.store = store
        self.preserve_bnode_ids = False
        self.reset()

    def reset(self) -> None:
        """Return the handler to its initial, pre-document state."""
        self.bnode: Dict[str, BNode] = {}
        self.graph: Optional[Graph] = None
        self.triple: Optional[List[Identifier]] = None
        self.state = 0
        self.lang = None
        self.datatype = None

    # ContentHandler methods

    def setDocumentLocator(self, locator: Locator):
        """Remember the locator so ``error`` can report positions."""
        self.locator = locator

    def startDocument(self) -> None:
        pass

    def startPrefixMapping(self, prefix: Optional[str], namespace: str) -> None:
        pass

    def endPrefixMapping(self, prefix: Optional[str]) -> None:
        pass

    def _begin_literal(self, attrs: AttributesImpl) -> None:
        """Clear literal metadata, then pick up ``xml:lang`` when present."""
        self.lang = None
        self.datatype = None
        try:
            # type error: Argument 1 to "getValue" of "AttributesImpl" has incompatible type "Tuple[str, str]"; expected "str"
            self.lang = attrs.getValue((str(XMLNS), "lang"))  # type: ignore[arg-type, unused-ignore]
        except Exception:
            # language not required - ignore
            pass

    def startElementNS(
        self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
    ) -> None:
        """Advance the state machine for an opening element.

        Reports a ``ParserError`` through ``error`` when the element is not
        in the TriX namespace, is unknown, or appears in the wrong place.
        On success the character buffer is cleared.
        """
        if name[0] != str(TRIXNS):
            self.error(
                "Only elements in the TriX namespace are allowed. %s!=%s"
                % (name[0], TRIXNS)
            )

        local = name[1]
        state = self.state

        if local.lower() == "trix":
            if state != 0:
                self.error("Unexpected TriX element")
            else:
                self.state = 1

        elif local == "graph":
            if state != 1:
                self.error("Unexpected graph element")
            else:
                self.state = 2

        elif local in ("uri", "id"):
            if state == 2:
                # directly inside <graph>: this names the context
                self.state = 3
            elif state != 4:
                # state 4 means "part of a triple" and needs no transition
                self.error(
                    "Unexpected uri element"
                    if local == "uri"
                    else "Unexpected id element"
                )

        elif local == "triple":
            if state != 2:
                self.error("Unexpected triple element")
            else:
                if self.graph is None:
                    # anonymous graph, create one with random bnode id
                    self.graph = Graph(store=self.store)
                # start of a triple
                self.triple = []
                self.state = 4

        elif local == "typedLiteral":
            if state != 4:
                self.error("Unexpected typedLiteral element")
            else:
                self._begin_literal(attrs)
                try:
                    self.datatype = attrs.getValueByQName("datatype")  # type: ignore[arg-type, unused-ignore]
                except KeyError:
                    self.error("No required attribute 'datatype'")

        elif local == "plainLiteral":
            if state != 4:
                self.error("Unexpected plainLiteral element")
            else:
                self._begin_literal(attrs)

        else:
            self.error("Unknown element %s in TriX namespace" % name[1])

        self.chars = ""

    def endElementNS(self, name: Tuple[Optional[str], str], qname) -> None:
        """Finish an element: build terms, emit triples, unwind the state."""
        if TYPE_CHECKING:
            assert self.triple is not None
        if name[0] != str(TRIXNS):
            self.error(
                "Only elements in the TriX namespace are allowed. %s!=%s"
                % (name[0], TRIXNS)
            )

        local = name[1]
        state = self.state

        if local in ("uri", "id"):
            if state in (3, 4):
                text = self.chars.strip()
                term = URIRef(text) if local == "uri" else self.get_bnode(text)
                if state == 3:
                    # the term names the graph that the following triples go to
                    self.graph = Graph(store=self.store, identifier=term)
                    self.state = 2
                else:
                    self.triple += [term]
            else:
                self.error(
                    "Illegal internal self.state - This should never "
                    + "happen if the SAX parser ensures XML syntax correctness"
                )

        elif local in ("plainLiteral", "typedLiteral"):
            if state != 4:
                self.error(
                    "This should never happen if the SAX parser "
                    + "ensures XML syntax correctness"
                )
            else:
                self.triple += [
                    Literal(self.chars, lang=self.lang, datatype=self.datatype)
                ]

        elif local == "triple":
            if state != 4:
                self.error(
                    "This should never happen if the SAX parser "
                    + "ensures XML syntax correctness"
                )
            else:
                if len(self.triple) != 3:
                    self.error(
                        "Triple has wrong length, got %d elements: %s"
                        % (len(self.triple), self.triple)
                    )
                # type error: Item "None" of "Optional[Graph]" has no attribute "add"
                # type error: Argument 1 to "add" of "Graph" has incompatible type "List[Identifier]"; expected "Tuple[Node, Node, Node]"
                self.graph.add(self.triple)  # type: ignore[union-attr, arg-type]
                self.state = 2

        elif local == "graph":
            self.graph = None
            self.state = 1

        elif local.lower() == "trix":
            self.state = 0

        else:
            self.error("Unexpected close element")

    def get_bnode(self, label: str) -> BNode:
        """Return the blank node for ``label``.

        With ``preserve_bnode_ids`` set, a ``BNode(label)`` is built on every
        call; otherwise the node is created once per label and cached in
        ``self.bnode``.
        """
        if self.preserve_bnode_ids:
            return BNode(label)

        node = self.bnode.get(label)
        if node is None:
            node = self.bnode[label] = BNode(label)
        return node

    def characters(self, content: str) -> None:
        """Accumulate text content of the current element."""
        self.chars += content

    def ignorableWhitespace(self, content) -> None:
        pass

    def processingInstruction(self, target, data) -> None:
        pass

    def error(self, message: str) -> NoReturn:
        """Raise ``ParserError`` prefixed with ``system-id:line:column: ``."""
        where = self.locator
        prefix = "%s:%s:%s: " % (
            where.getSystemId(),
            where.getLineNumber(),
            where.getColumnNumber(),
        )
        raise ParserError(prefix + message)
def create_parser(store: Store) -> XMLReader:
    """Build a namespace-aware SAX reader wired to a fresh ``TriXHandler``.

    :param store: the store handed to the ``TriXHandler`` constructor.
    :return: the configured reader, with a default ``ErrorHandler``
        installed.
    """
    reader = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        # type error: "XMLReader" has no attribute "start_namespace_decl"
        reader.start_namespace_decl("xml", "http://www.w3.org/XML/1998/namespace")  # type: ignore[attr-defined]
    except AttributeError:
        # Not present in Jython (at least)
        pass

    reader.setFeature(handler.feature_namespaces, 1)
    reader.setContentHandler(TriXHandler(store))
    reader.setErrorHandler(ErrorHandler())
    return reader
class TriXParser(Parser):
    """A parser for TriX. See http://sw.nokia.com/trix/"""

    def __init__(self):
        pass

    def parse(self, source: InputSource, sink: Graph, **args: Any) -> None:
        """Parse TriX from ``source`` into the store behind ``sink``.

        :param source: the input source handed to the SAX reader.
        :param sink: graph whose store receives the parsed data; the store
            must be context aware.
        :param args: optional keyword arguments; only ``preserve_bnode_ids``
            is looked at. When it is present and not ``None`` it is copied
            onto the content handler before parsing starts.
        :raises AssertionError: if the store is not context aware.
        """
        assert (
            sink.store.context_aware
        ), "TriXParser must be given a context aware store."

        reader = create_parser(sink.store)
        # Keep the reader reachable on the instance, as callers may expect.
        self._parser = reader

        trix_handler = reader.getContentHandler()
        keep_bnode_ids = args.get("preserve_bnode_ids")
        if keep_bnode_ids is not None:
            # type error: ContentHandler has no attribute "preserve_bnode_ids"
            trix_handler.preserve_bnode_ids = keep_bnode_ids  # type: ignore[attr-defined, unused-ignore]

        # The reader is used exactly once, so no reset is required here.
        reader.parse(source)