2025-12-01
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
from rdflib.namespace import RDF # noqa: N999
|
||||
from rdflib.term import URIRef
|
||||
|
||||
|
||||
class RDFVOC(RDF):
    """Subclass of the imported ``RDF`` namespace declaring RDF/XML syntax terms.

    Each name below is declared (annotation only, no value assigned here) as a
    ``URIRef`` attribute of the namespace class.
    """

    # NOTE(review): these two flags are presumably options consumed by the
    # namespace machinery behind ``RDF`` (handling of ``_1``, ``_2``, ...
    # members and failing on unknown terms) — confirm against rdflib.namespace.
    _underscore_num = True
    _fail = True

    # http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
    # A mapping from unqualified terms to their qualified version.
    RDF: URIRef
    Description: URIRef
    ID: URIRef
    about: URIRef
    parseType: URIRef  # noqa: N815
    resource: URIRef
    li: URIRef
    nodeID: URIRef  # noqa: N815
    datatype: URIRef
|
||||
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
|
||||
"""
|
||||
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
This is a rdflib plugin for parsing Hextuple files, which are Newline-Delimited JSON
|
||||
(ndjson) files, into a ConjunctiveGraph or Dataset. The store that backs the graph *must* be able to
|
||||
handle contexts, i.e. multiple graphs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import warnings
|
||||
from io import TextIOWrapper
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, List, Optional, TextIO, Union
|
||||
|
||||
from rdflib.graph import ConjunctiveGraph, Dataset, Graph
|
||||
from rdflib.parser import InputSource, Parser
|
||||
from rdflib.term import BNode, Literal, URIRef
|
||||
|
||||
try:
|
||||
import orjson
|
||||
|
||||
_HAS_ORJSON = True
|
||||
except ImportError:
|
||||
orjson = None # type: ignore[assignment, unused-ignore]
|
||||
_HAS_ORJSON = False
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from io import BufferedReader
|
||||
|
||||
__all__ = ["HextuplesParser"]
|
||||
|
||||
|
||||
class HextuplesParser(Parser):
    """
    An RDFLib parser for Hextuples.

    Every non-blank input line is decoded as a JSON array whose positions are
    read as: 0 subject, 1 predicate, 2 value, 3 datatype, 4 language, 5 graph.
    """

    def __init__(self):
        super().__init__()
        # Graph that receives statements whose graph position is empty;
        # chosen in parse().
        self.default_context: Optional[Graph] = None
        # When True, every blank node produced is replaced by ``.skolemize()``.
        self.skolemize = False

    def _parse_hextuple(
        self, ds: Union[Dataset, ConjunctiveGraph], tup: List[Union[str, None]]
    ) -> None:
        """Convert one decoded hextuple into a triple and add it to its graph.

        :param ds: dataset used to look up the named graph given at position 5
        :param tup: the six hextuple fields; empty fields are given as ``None``
        :raises ValueError: if subject, predicate, value or datatype is ``None``
        :raises Exception: if no graph is named and no default context is set
        """
        # all values check
        # subject, predicate, value, datatype cannot be None
        # language and graph may be None
        if tup[0] is None or tup[1] is None or tup[2] is None or tup[3] is None:
            raise ValueError(
                f"subject, predicate, value, datatype cannot be None. Given: {tup}"
            )

        # 1 - subject
        s: Union[URIRef, BNode]
        if tup[0].startswith("_"):
            s = BNode(value=tup[0].replace("_:", ""))
            if self.skolemize:
                s = s.skolemize()
        else:
            s = URIRef(tup[0])

        # 2 - predicate
        p = URIRef(tup[1])

        # 3 - value
        # The datatype field doubles as a node-kind marker: "globalId" means an
        # IRI, "localId" a blank node, anything else is a literal's datatype.
        o: Union[URIRef, BNode, Literal]
        if tup[3] == "globalId":
            o = URIRef(tup[2])
        elif tup[3] == "localId":
            o = BNode(value=tup[2].replace("_:", ""))
            if self.skolemize:
                o = o.skolemize()
        else:  # literal
            if tup[4] is None:
                o = Literal(tup[2], datatype=URIRef(tup[3]))
            else:
                # A language tag takes precedence; the datatype is not passed.
                o = Literal(tup[2], lang=tup[4])

        # 6 - context
        if tup[5] is not None:
            c = (
                BNode(tup[5].replace("_:", ""))
                if tup[5].startswith("_:")
                else URIRef(tup[5])
            )
            if isinstance(c, BNode) and self.skolemize:
                c = c.skolemize()

            ds.get_context(c).add((s, p, o))
        elif self.default_context is not None:
            self.default_context.add((s, p, o))
        else:
            raise Exception("No context to parse into!")

    # type error: Signature of "parse" incompatible with supertype "Parser"
    def parse(self, source: InputSource, graph: Graph, skolemize: bool = False, **kwargs: Any) -> None:  # type: ignore[override]
        """Parse a Hextuples document from ``source`` into ``graph``'s store.

        :param source: input providing a character stream and/or a byte stream
        :param graph: target graph; its store must be context-aware
        :param skolemize: if True, skolemize every blank node that is created
        :param kwargs: only ``encoding`` is inspected; any value other than
            ``None`` or ``"utf-8"`` triggers a warning and is otherwise ignored
        :raises ValueError: if ``source`` offers no usable stream, or if a line
            does not decode to a JSON array with at least six elements
        """
        if kwargs.get("encoding") not in [None, "utf-8"]:
            warnings.warn(
                f"Hextuples files are always utf-8 encoded, "
                f"I was passed: {kwargs.get('encoding')}, "
                "but I'm still going to use utf-8"
            )

        assert (
            graph.store.context_aware
        ), "Hextuples Parser needs a context-aware store!"

        self.skolemize = skolemize
        # Set default_union to True to mimic ConjunctiveGraph behavior
        ds = Dataset(store=graph.store, default_union=True)
        ds_default = ds.default_context  # the DEFAULT_DATASET_GRAPH_ID
        if isinstance(graph, (Dataset, ConjunctiveGraph)):
            self.default_context = graph.default_context
        elif graph.identifier is not None:
            if graph.identifier == ds_default.identifier:
                self.default_context = graph
            else:
                self.default_context = ds.get_context(graph.identifier)
        else:
            # mypy thinks this is unreachable, but graph.identifier can be None
            self.default_context = ds_default  # type: ignore[unreachable]
        if self.default_context is not ds_default:
            ds.default_context = self.default_context
            ds.remove_graph(ds_default)  # remove the original unused default graph

        try:
            text_stream: Optional[TextIO] = source.getCharacterStream()
        except (AttributeError, LookupError):
            text_stream = None
        try:
            binary_stream: Optional[BinaryIO] = source.getByteStream()
        except (AttributeError, LookupError):
            binary_stream = None

        if text_stream is None and binary_stream is None:
            raise ValueError(
                f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
            )
        if TYPE_CHECKING:
            assert text_stream is not None or binary_stream is not None
        use_stream: Union[TextIO, BinaryIO]
        if _HAS_ORJSON:
            # With orjson the byte stream is preferred when one is available.
            if binary_stream is not None:
                use_stream = binary_stream
            else:
                if TYPE_CHECKING:
                    assert isinstance(text_stream, TextIOWrapper)
                use_stream = text_stream
            loads = orjson.loads
        else:
            if text_stream is not None:
                use_stream = text_stream
            else:
                if TYPE_CHECKING:
                    assert isinstance(binary_stream, BufferedReader)
                use_stream = TextIOWrapper(binary_stream, encoding="utf-8")
            loads = json.loads

        for line_no, line in enumerate(use_stream, start=1):  # type: int, Union[str, bytes]
            if len(line) == 0 or line.isspace():
                # Skipping empty lines because this is what was being done before for the first and last lines, albeit in a rather indirect way.
                # The result is that we accept input that would otherwise be invalid.
                # Possibly we should just let this result in an error.
                continue
            raw_line = loads(line)
            # Reject anything that cannot be indexed as a hextuple up front, so
            # malformed input gives a clear error instead of an IndexError,
            # KeyError or TypeError further down.
            if not isinstance(raw_line, list) or len(raw_line) < 6:
                raise ValueError(
                    f"Hextuples line {line_no} must be a JSON array of six "
                    f"elements. Given: {raw_line!r}"
                )
            # this complex handling is because the 'value' component is
            # allowed to be "" but not None
            # all other "" values are treated as None
            hex_tuple_line = [x if x != "" else None for x in raw_line]
            if raw_line[2] == "":
                hex_tuple_line[2] = ""
            self._parse_hextuple(ds, hex_tuple_line)
|
||||
@@ -0,0 +1,712 @@
|
||||
"""
|
||||
This parser will interpret a JSON-LD document as an RDF Graph. See:
|
||||
|
||||
http://json-ld.org/
|
||||
|
||||
Example usage::
|
||||
|
||||
>>> from rdflib import Graph, URIRef, Literal
|
||||
>>> test_json = '''
|
||||
... {
|
||||
... "@context": {
|
||||
... "dc": "http://purl.org/dc/terms/",
|
||||
... "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
||||
... "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
|
||||
... },
|
||||
... "@id": "http://example.org/about",
|
||||
... "dc:title": {
|
||||
... "@language": "en",
|
||||
... "@value": "Someone's Homepage"
|
||||
... }
|
||||
... }
|
||||
... '''
|
||||
>>> g = Graph().parse(data=test_json, format='json-ld')
|
||||
>>> list(g) == [(URIRef('http://example.org/about'),
|
||||
... URIRef('http://purl.org/dc/terms/title'),
|
||||
... Literal("Someone's Homepage", lang='en'))]
|
||||
True
|
||||
|
||||
"""
|
||||
|
||||
# From: https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/parser.py
|
||||
|
||||
# NOTE: This code reads the entire JSON object into memory before parsing, but
|
||||
# we should consider streaming the input to deal with arbitrarily large graphs.
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
|
||||
|
||||
import rdflib.parser
|
||||
from rdflib.graph import ConjunctiveGraph, Graph
|
||||
from rdflib.namespace import RDF, XSD
|
||||
from rdflib.parser import InputSource, URLInputSource
|
||||
from rdflib.term import BNode, IdentifiedNode, Literal, Node, URIRef
|
||||
|
||||
from ..shared.jsonld.context import UNDEF, Context, Term
|
||||
from ..shared.jsonld.keys import (
|
||||
CONTEXT,
|
||||
GRAPH,
|
||||
ID,
|
||||
INCLUDED,
|
||||
INDEX,
|
||||
JSON,
|
||||
LANG,
|
||||
LIST,
|
||||
NEST,
|
||||
NONE,
|
||||
REV,
|
||||
SET,
|
||||
TYPE,
|
||||
VALUE,
|
||||
VOCAB,
|
||||
)
|
||||
from ..shared.jsonld.util import (
|
||||
_HAS_ORJSON,
|
||||
VOCAB_DELIMS,
|
||||
context_from_urlinputsource,
|
||||
json,
|
||||
orjson,
|
||||
source_to_json,
|
||||
)
|
||||
|
||||
__all__ = ["JsonLDParser", "to_rdf"]
|
||||
|
||||
# Term substituted by Parser._key_to_graph whenever a key (or the id of its
# term) is the TYPE keyword; built from the rdf:type IRI.
TYPE_TERM = Term(str(RDF.type), TYPE, VOCAB)  # type: ignore[call-arg]

# Default for Parser.allow_lists_of_lists when the caller passes None.
ALLOW_LISTS_OF_LISTS = True  # NOTE: Not allowed in JSON-LD 1.0
|
||||
|
||||
|
||||
class JsonLDParser(rdflib.parser.Parser):
    """RDFLib parser plugin that loads a JSON-LD document into a graph."""

    def __init__(self):
        super().__init__()

    def parse(
        self,
        source: InputSource,
        sink: Graph,
        version: float = 1.1,
        skolemize: bool = False,
        encoding: Optional[str] = "utf-8",
        base: Optional[str] = None,
        context: Optional[
            Union[
                List[Union[Dict[str, Any], str, None]],
                Dict[str, Any],
                str,
            ]
        ] = None,
        generalized_rdf: Optional[bool] = False,
        extract_all_scripts: Optional[bool] = False,
        **kwargs: Any,
    ) -> None:
        """Parse JSON-LD from a source document.

        The source document can be JSON or HTML with embedded JSON script
        elements (type attribute = "application/ld+json"). To process as HTML
        ``source.content_type`` must be set to "text/html" or
        "application/xhtml+xml".

        :param source: InputSource with JSON-formatted data (JSON or HTML)

        :param sink: Graph to receive the parsed triples

        :param version: parse as JSON-LD version, defaults to 1.1. A value that
            cannot be converted to ``float`` (including ``None``) falls back
            to 1.1

        :param skolemize: if True, blank nodes created while parsing are
            replaced by their skolemized form, defaults to False

        :param encoding: character encoding of the JSON (should be "utf-8"
            or "utf-16"), defaults to "utf-8"

        :param base: JSON-LD `Base IRI <https://www.w3.org/TR/json-ld/#base-iri>`_, defaults to None

        :param context: JSON-LD `Context <https://www.w3.org/TR/json-ld/#the-context>`_, defaults to None

        :param generalized_rdf: parse as `Generalized RDF <https://www.w3.org/TR/json-ld/#relationship-to-rdf>`_, defaults to False

        :param extract_all_scripts: if source is an HTML document then extract
            all script elements, defaults to False (extract only the first
            script element). This is ignored if ``source.system_id`` contains
            a fragment identifier, in which case only the script element with
            matching id attribute is extracted.

        """
        if encoding not in ("utf-8", "utf-16"):
            warnings.warn(
                "JSON should be encoded as unicode. "
                "Given encoding was: %s" % encoding
            )

        if not base:
            base = sink.absolutize(source.getPublicId() or source.getSystemId() or "")

        context_data = context
        if not context_data and hasattr(source, "url") and hasattr(source, "links"):
            if TYPE_CHECKING:
                assert isinstance(source, URLInputSource)
            context_data = context_from_urlinputsource(source)

        try:
            version = float(version)
        except (TypeError, ValueError):
            # float() raises TypeError for None and other non-numeric objects
            # and ValueError for unparsable strings; both mean "use default".
            version = 1.1

        # Get the optional fragment identifier
        try:
            fragment_id = URIRef(source.getSystemId()).fragment
        except Exception:
            # Deliberately best-effort: no usable system id means no fragment.
            fragment_id = None

        data, html_base = source_to_json(source, fragment_id, extract_all_scripts)
        if html_base is not None:
            base = URIRef(html_base, base=base)

        # NOTE: A ConjunctiveGraph parses into a Graph sink, so no sink will be
        # context_aware. Keeping this check in case RDFLib is changed, or
        # someone passes something context_aware to this parser directly.
        conj_sink: Graph
        if not sink.context_aware:
            conj_sink = ConjunctiveGraph(store=sink.store, identifier=sink.identifier)
        else:
            conj_sink = sink

        to_rdf(
            data,
            conj_sink,
            base,
            context_data,
            version,
            bool(generalized_rdf),
            skolemize=skolemize,
        )
|
||||
|
||||
|
||||
def to_rdf(
    data: Any,
    dataset: Graph,
    base: Optional[str] = None,
    context_data: Optional[
        Union[
            List[Union[Dict[str, Any], str, None]],
            Dict[str, Any],
            str,
        ]
    ] = None,
    version: Optional[float] = None,
    generalized_rdf: bool = False,
    allow_lists_of_lists: Optional[bool] = None,
    skolemize: bool = False,
):
    """Convert already-decoded JSON-LD ``data`` into triples in ``dataset``.

    :param data: decoded JSON-LD (the value handed to ``Parser.parse``)
    :param dataset: graph that receives the triples
    :param base: base IRI used to create the initial ``Context``
    :param context_data: optional external context, loaded before parsing
    :param version: JSON-LD version given to the ``Context``
    :param generalized_rdf: forwarded to ``Parser``
    :param allow_lists_of_lists: forwarded to ``Parser``
    :param skolemize: forwarded to ``Parser``
    :return: whatever ``Parser.parse`` returns
    """
    active_context = Context(base=base, version=version)
    if context_data:
        active_context.load(context_data)

    return Parser(
        generalized_rdf=generalized_rdf,
        allow_lists_of_lists=allow_lists_of_lists,
        skolemize=skolemize,
    ).parse(data, active_context, dataset)
|
||||
|
||||
|
||||
class Parser:
|
||||
def __init__(
|
||||
self,
|
||||
generalized_rdf: bool = False,
|
||||
allow_lists_of_lists: Optional[bool] = None,
|
||||
skolemize: bool = False,
|
||||
):
|
||||
self.skolemize = skolemize
|
||||
self.generalized_rdf = generalized_rdf
|
||||
self.allow_lists_of_lists = (
|
||||
allow_lists_of_lists
|
||||
if allow_lists_of_lists is not None
|
||||
else ALLOW_LISTS_OF_LISTS
|
||||
)
|
||||
self.invalid_uri_to_bnode: dict[str, BNode] = {}
|
||||
|
||||
def parse(self, data: Any, context: Context, dataset: Graph) -> Graph:
    """Add every node object found in ``data`` to ``dataset``.

    :param data: decoded JSON-LD. A list is treated as a list of resources and
        a dict as a single resource whose ``@context`` (if any) is loaded into
        ``context``. Any other value (a top-level JSON scalar or null)
        contributes no resources, just as scalar members of a top-level list
        are ignored.
    :param context: active context; updated in place by a top-level context
    :param dataset: target graph; prefixes found in ``context`` are bound on it
    :return: the graph the top-level nodes were added to
    """
    topcontext = False
    resources: List[Any]
    if isinstance(data, list):
        resources = data
    elif isinstance(data, dict):
        local_context = data.get(CONTEXT)
        if local_context:
            context.load(local_context, context.base)
            topcontext = True
        resources = [data]
    else:
        # Previously ``resources`` stayed unbound here and the loop below
        # failed with UnboundLocalError.
        resources = []

    if context.vocab:
        dataset.bind(None, context.vocab)
    for name, term in context.terms.items():
        if term.id and term.id.endswith(VOCAB_DELIMS):
            dataset.bind(name, term.id)

    # type error: "Graph" has no attribute "default_context"
    graph = dataset.default_context if dataset.context_aware else dataset  # type: ignore[attr-defined]

    for node in resources:
        self._add_to_graph(dataset, graph, context, node, topcontext)

    return graph
|
||||
|
||||
def _add_to_graph(
    self,
    dataset: Graph,
    graph: Graph,
    context: Context,
    node: Any,
    topcontext: bool = False,
) -> Optional[Node]:
    """Add one node object to ``graph`` and return its subject.

    Returns ``None`` without adding anything when ``node`` is not a dict, when
    it carries a truthy value according to ``context.get_value``, or when its
    id cannot be turned into an RDF term.

    :param dataset: dataset passed through to ``_key_to_graph``
    :param graph: graph receiving this node's triples
    :param context: active context; replaced locally by the node's own context
    :param node: candidate node object
    :param topcontext: True when the node's ``@context`` was already loaded by
        the caller and must not be applied again
    """
    if not isinstance(node, dict) or context.get_value(node):
        return None

    if CONTEXT in node and not topcontext:
        local_context = node[CONTEXT]
        # A falsy local context resets to a fresh context on the document base.
        context = (
            context.subcontext(local_context)
            if local_context
            else Context(base=context.doc_base)
        )

    # type error: Incompatible types in assignment (expression has type "Optional[Context]", variable has type "Context")
    context = context.get_context_for_type(node)  # type: ignore[assignment]

    id_val = context.get_id(node)
    if id_val is None:
        nested_id = self._get_nested_id(context, node)
        if nested_id:
            id_val = nested_id

    if isinstance(id_val, str):
        subj = self._to_rdf_id(context, id_val)
        if subj is None:
            return None
    else:
        subj = BNode()
        if self.skolemize:
            subj = subj.skolemize()

    # NOTE: crude way to signify that this node might represent a named graph
    no_id = id_val is None

    for key, obj in node.items():
        if key == CONTEXT or key in context.get_keys(ID):
            continue

        if key == REV or key in context.get_keys(REV):
            # Members of a reverse map are added with subject/object swapped.
            for rkey, robj in obj.items():
                self._key_to_graph(
                    dataset,
                    graph,
                    context,
                    subj,
                    rkey,
                    robj,
                    reverse=True,
                    no_id=no_id,
                )
        else:
            self._key_to_graph(dataset, graph, context, subj, key, obj, no_id=no_id)

    return subj
|
||||
|
||||
def _get_nested_id(self, context: Context, node: Dict[str, Any]) -> Optional[str]:
    """Return the first string id found under the nest keys of ``node``.

    Only consulted for contexts with version >= 1.1. Nest keys whose term has
    no id are skipped. The search recurses into nested dicts that carry no id
    of their own. Returns ``None`` when nothing is found.
    """
    for key, value in node.items():
        if not (context.version >= 1.1 and key in context.get_keys(NEST)):
            continue
        nest_term = context.terms.get(key)
        if nest_term and nest_term.id is None:
            continue

        candidates = value if isinstance(value, list) else [value]
        for candidate in candidates:
            if not isinstance(candidate, dict):
                continue
            found = context.get_id(candidate)
            if not found:
                subcontext = context.get_context_for_term(
                    context.terms.get(key)
                )
                found = self._get_nested_id(subcontext, candidate)
            if isinstance(found, str):
                return found

    return None
|
||||
|
||||
def _key_to_graph(
    self,
    dataset: Graph,
    graph: Graph,
    context: Context,
    subj: Node,
    key: str,
    obj: Any,
    reverse: bool = False,
    no_id: bool = False,
) -> None:
    """Translate one ``key: obj`` member of a node object into triples.

    Keyword-like keys (graph, set, included, nest) are dispatched to
    ``_add_to_graph`` or back into this method; any other key is expanded to a
    predicate and each value is converted with ``_to_object``.

    :param dataset: dataset used to obtain a named graph for graph keys
    :param graph: graph currently being filled
    :param context: active context for ``key``
    :param subj: subject of the enclosing node object
    :param key: member name as written in the document
    :param obj: member value
    :param reverse: if True, emit ``(object, predicate, subject)``; toggled
        again when the term itself is marked reverse
    :param no_id: True when the enclosing node had no id; a graph key then
        adds to the current graph rather than to a graph named after ``subj``
    """
    # Always work on a list of value nodes.
    if isinstance(obj, list):
        obj_nodes = obj
    else:
        obj_nodes = [obj]

    term = context.terms.get(key)
    if term:
        term_id = term.id
        if term.type == JSON:
            # A JSON-typed term keeps the whole value as one JSON literal.
            obj_nodes = [self._to_typed_json_value(obj)]
        elif LIST in term.container:
            obj_nodes = [self._expand_nested_list(obj_nodes)]
        elif isinstance(obj, dict):
            obj_nodes = self._parse_container(context, term, obj)
    else:
        term_id = None

    if TYPE in (key, term_id):
        term = TYPE_TERM

    if GRAPH in (key, term_id):
        if dataset.context_aware and not no_id:
            if TYPE_CHECKING:
                assert isinstance(dataset, ConjunctiveGraph)
            # type error: Argument 1 to "get_context" of "ConjunctiveGraph" has incompatible type "Node"; expected "Union[IdentifiedNode, str, None]"
            subgraph = dataset.get_context(subj)  # type: ignore[arg-type]
        else:
            subgraph = graph
        for onode in obj_nodes:
            self._add_to_graph(dataset, subgraph, context, onode)
        return

    if SET in (key, term_id):
        for onode in obj_nodes:
            self._add_to_graph(dataset, graph, context, onode)
        return

    if INCLUDED in (key, term_id):
        for onode in obj_nodes:
            self._add_to_graph(dataset, graph, context, onode)
        return

    if context.version >= 1.1 and key in context.get_keys(NEST):
        term = context.terms.get(key)
        if term and term.id is None:
            return
        objs = obj if isinstance(obj, list) else [obj]
        for obj in objs:
            if not isinstance(obj, dict):
                continue
            # Members of a nested object are attached to the same subject.
            for nkey, nobj in obj.items():
                # NOTE: we've already captured subject
                if nkey in context.get_keys(ID):
                    continue
                subcontext = context.get_context_for_type(obj)
                # type error: Argument 3 to "_key_to_graph" of "Parser" has incompatible type "Optional[Context]"; expected "Context"
                self._key_to_graph(dataset, graph, subcontext, subj, nkey, nobj)  # type: ignore[arg-type]
        return

    pred_uri = term.id if term else context.expand(key)

    # From here on ``context`` is the term-scoped context; flatten() below
    # reads this rebound name when it is called.
    context = context.get_context_for_term(term)

    # Flatten deep nested lists
    def flatten(n: Iterable[Any]) -> List[Any]:
        flattened = []
        for obj in n:
            if isinstance(obj, dict):
                objs = context.get_set(obj)
                if objs is not None:
                    obj = objs
            if isinstance(obj, list):
                flattened += flatten(obj)
                continue
            flattened.append(obj)
        return flattened

    obj_nodes = flatten(obj_nodes)

    # A key that does not expand to anything produces no triples.
    if not pred_uri:
        return

    if term and term.reverse:
        reverse = not reverse

    pred: IdentifiedNode
    bid = self._get_bnodeid(pred_uri)
    if bid:
        # Blank-node predicates are only emitted in generalized RDF mode.
        if not self.generalized_rdf:
            return
        pred = BNode(bid)
        if self.skolemize:
            pred = pred.skolemize()
    else:
        pred = URIRef(pred_uri)

    for obj_node in obj_nodes:
        obj = self._to_object(dataset, graph, context, term, obj_node)
        if obj is None:
            continue
        if reverse:
            graph.add((obj, pred, subj))
        else:
            graph.add((subj, pred, obj))
|
||||
|
||||
def _parse_container(
    self, context: Context, term: Term, obj: Dict[str, Any]
) -> List[Any]:
    """Expand a container map ``obj`` into a list of value nodes.

    The result depends on which keywords appear in ``term.container``.
    Language maps are handled for every version; graph, id and type maps only
    when ``context.version >= 1.1``. Without a matching container the map is
    returned as a single node.
    """
    container = term.container

    if LANG in container:
        lang_nodes: List[Any] = []
        for lang, values in obj.items():
            if not isinstance(values, list):
                values = [values]
            if lang in context.get_keys(NONE):
                # Values under the "none" key carry no language tag.
                lang_nodes += values
            else:
                lang_nodes.extend((v, lang) for v in values)
        return lang_nodes

    v11 = context.version >= 1.1
    graph_container = v11 and GRAPH in container

    if graph_container and ID in container:
        graph_nodes: List[Any] = []
        for k, o in obj.items():
            if k in context.get_keys(NONE):
                graph_nodes.append({GRAPH: o})
            elif isinstance(o, dict):
                graph_nodes.append({ID: k, GRAPH: o})
            else:
                graph_nodes.append(o)
        return graph_nodes

    if graph_container and INDEX in container:
        return [{GRAPH: o} for o in obj.values()]

    if graph_container:
        return [{GRAPH: obj}]

    if v11 and ID in container:
        id_nodes: List[Any] = []
        for k, o in obj.items():
            if isinstance(o, dict) and k not in context.get_keys(NONE):
                # The map key is the default id; an id inside ``o`` wins.
                id_nodes.append(dict({ID: k}, **o))
            else:
                id_nodes.append(o)
        return id_nodes

    if v11 and TYPE in container:
        type_nodes: List[Any] = []
        for k, o in obj.items():
            if isinstance(o, (dict, str)) and k not in context.get_keys(NONE):
                if isinstance(o, str):
                    # A bare string is a node reference.
                    typed = {ID: context.expand(o) if term.type == VOCAB else o}
                else:
                    typed = o
                type_nodes.append(self._add_type(context, typed, k))
            else:
                type_nodes.append(o)
        return type_nodes

    if INDEX in container:
        index_nodes: List[Any] = []
        for key, nodes in obj.items():
            if not isinstance(nodes, list):
                nodes = [nodes]
            for node in nodes:
                if v11 and term.index and key not in context.get_keys(NONE):
                    if not isinstance(node, dict):
                        node = {ID: node}
                    # Record the map key under the property named by term.index.
                    values = node.get(term.index, [])
                    if not isinstance(values, list):
                        values = [values]
                    values.append(key)
                    node[term.index] = values
                index_nodes.append(node)
        return index_nodes

    return [obj]
|
||||
|
||||
@staticmethod
def _add_type(context: Context, o: Dict[str, Any], k: str) -> Dict[str, Any]:
    """Append ``k`` to the types of node ``o`` and return ``o``.

    The current type is read through ``context.get_type``. A single non-list
    type is wrapped in a list first. The result is stored under ``TYPE`` on
    ``o`` itself (the node is modified in place).
    """
    types = context.get_type(o)
    if not types:
        types = []
    elif not isinstance(types, list):
        types = [types]
    types.append(k)
    o[TYPE] = types
    return o
|
||||
|
||||
def _to_object(
    self,
    dataset: Graph,
    graph: Graph,
    context: Context,
    term: Optional[Term],
    node: Any,
    inlist: bool = False,
) -> Optional[Node]:
    """Convert one value node into an RDF object term.

    :param dataset: dataset passed on when the value is a nested node object
    :param graph: graph receiving triples of nested nodes and lists
    :param context: active context
    :param term: term definition of the property the value belongs to, if any
    :param node: the value: a ``(value, lang)`` tuple, a dict, or a scalar
    :param inlist: True when called for a member of a list
    :return: a literal, the subject of a nested node, the head of an RDF list,
        or ``None`` when the value yields no object
    """
    # (value, lang) tuples are produced for language maps.
    if isinstance(node, tuple):
        value, lang = node
        if value is None:
            # type error: Return value expected
            return  # type: ignore[return-value]
        # A language tag containing a space is rejected.
        if lang and " " in lang:
            # type error: Return value expected
            return  # type: ignore[return-value]
        return Literal(value, lang=lang)

    if isinstance(node, dict):
        node_list = context.get_list(node)
        if node_list is not None:
            if inlist and not self.allow_lists_of_lists:
                # type error: Return value expected
                return  # type: ignore[return-value]
            listref = self._add_list(dataset, graph, context, term, node_list)
            if listref:
                return listref

    else:  # expand compacted value
        if term and term.type:
            # Rewrap the scalar as an expanded dict according to the term's
            # type coercion; it is then handled by the code below.
            if term.type == JSON:
                node = self._to_typed_json_value(node)
            elif node is None:
                # type error: Return value expected
                return  # type: ignore[return-value]
            elif term.type == ID and isinstance(node, str):
                node = {ID: context.resolve(node)}
            elif term.type == VOCAB and isinstance(node, str):
                node = {ID: context.expand(node) or context.resolve_iri(node)}
            else:
                node = {TYPE: term.type, VALUE: node}
        else:
            if node is None:
                # type error: Return value expected
                return  # type: ignore[return-value]
            if isinstance(node, float):
                return Literal(node, datatype=XSD.double)

            # A language set on the term (anything but UNDEF) overrides the
            # context's default language.
            if term and term.language is not UNDEF:
                lang = term.language
            else:
                lang = context.language
            return Literal(node, lang=lang)

    lang = context.get_language(node)
    # The datatype is only looked up when there is no language.
    datatype = not lang and context.get_type(node) or None
    value = context.get_value(node)
    # type error: Unsupported operand types for in ("Optional[Any]" and "Generator[str, None, None]")
    if datatype in context.get_keys(JSON):  # type: ignore[operator]
        node = self._to_typed_json_value(value)
        datatype = context.get_type(node)
        value = context.get_value(node)

    if lang or context.get_key(VALUE) in node or VALUE in node:
        # Value object: becomes a literal.
        if value is None:
            return None
        if lang:
            if " " in lang:
                # type error: Return value expected
                return  # type: ignore[return-value]
            return Literal(value, lang=lang)
        elif datatype:
            return Literal(value, datatype=context.expand(datatype))
        else:
            return Literal(value)
    else:
        # Anything else is a nested node object; its subject is the object.
        return self._add_to_graph(dataset, graph, context, node)
|
||||
|
||||
def _to_rdf_id(self, context: Context, id_val: str) -> Optional[IdentifiedNode]:
    """Turn a node id string into a blank node or URI reference.

    ``_:``-prefixed ids become blank nodes (skolemized when
    ``self.skolemize`` is set). Other ids are resolved through ``context``.
    Unless ``generalized_rdf`` is enabled, a resolved id without a ``:``
    yields ``None``. An id that resolves to an empty URI is mapped to a blank
    node that stays the same for repeated occurrences of that id.
    """
    label = self._get_bnodeid(id_val)
    if label:
        blank = BNode(label)
        return blank.skolemize() if self.skolemize else blank

    resolved = context.resolve(id_val)
    if not self.generalized_rdf and ":" not in resolved:
        return None

    ref: IdentifiedNode = URIRef(resolved)
    if str(ref):
        return ref

    # Empty URI: hand out one random blank node per distinct id.
    replacement = self.invalid_uri_to_bnode.get(id_val)
    if replacement is None:
        replacement = BNode(secrets.token_urlsafe(20))
        self.invalid_uri_to_bnode[id_val] = replacement
    return replacement
|
||||
|
||||
def _get_bnodeid(self, ref: str) -> Optional[str]:
|
||||
if not ref.startswith("_:"):
|
||||
# type error: Return value expected
|
||||
return # type: ignore[return-value]
|
||||
bid = ref.split("_:", 1)[-1]
|
||||
return bid or None
|
||||
|
||||
def _add_list(
    self,
    dataset: Graph,
    graph: Graph,
    context: Context,
    term: Optional[Term],
    node_list: Any,
) -> IdentifiedNode:
    """Write ``node_list`` to ``graph`` as an RDF collection.

    Items that are ``None``, or that ``_to_object`` converts to ``None``, are
    left out of the collection entirely.

    :param dataset: dataset passed through to ``_to_object``
    :param graph: graph receiving the ``rdf:first`` / ``rdf:rest`` triples
    :param context: active context
    :param term: term definition of the owning property, if any
    :param node_list: list of items; a single non-list value is treated as a
        one-item list
    :return: the head node of the collection, or ``rdf:nil`` when no item
        produced an object
    """
    if not isinstance(node_list, list):
        node_list = [node_list]

    first_subj: Union[URIRef, BNode] = BNode()
    if self.skolemize and isinstance(first_subj, BNode):
        first_subj = first_subj.skolemize()

    # ``subj`` is the cell being filled; ``rest`` is the cell prepared for the
    # next item (None until the first item has been written).
    rest: Union[URIRef, BNode, None]
    subj, rest = first_subj, None

    for node in node_list:
        if node is None:
            continue

        # Convert before linking: advancing to the next cell for an item that
        # is then skipped would leave that cell without rdf:first and make the
        # following item add a (cell, rdf:rest, cell) self-loop.
        obj = self._to_object(dataset, graph, context, term, node, inlist=True)
        if obj is None:
            continue

        if rest:
            # type error: Statement is unreachable
            graph.add((subj, RDF.rest, rest))  # type: ignore[unreachable]
            subj = rest

        graph.add((subj, RDF.first, obj))
        rest = BNode()
        if self.skolemize and isinstance(rest, BNode):
            rest = rest.skolemize()

    if rest:
        # At least one item was written: terminate the last cell.
        graph.add((subj, RDF.rest, RDF.nil))
        return first_subj
    else:
        return RDF.nil
|
||||
|
||||
@staticmethod
def _to_typed_json_value(value: Any) -> Dict[str, str]:
    """Wrap ``value`` as a value object typed ``RDF.JSON``.

    The value is serialized to a JSON string with sorted keys, using orjson
    when it is available and the standard ``json`` module otherwise.
    """
    if not _HAS_ORJSON:
        serialized: str = json.dumps(
            value, separators=(",", ":"), sort_keys=True, ensure_ascii=False
        )
    else:
        raw = orjson.dumps(
            value,
            option=orjson.OPT_SORT_KEYS | orjson.OPT_NON_STR_KEYS,
        )
        serialized = raw.decode("utf-8")

    return {TYPE: RDF.JSON, VALUE: serialized}
|
||||
|
||||
@classmethod
def _expand_nested_list(cls, obj_nodes: List[Any]) -> Dict[str, List[Any]]:
    """Wrap ``obj_nodes`` in a ``{LIST: [...]}`` object, recursing into sub-lists.

    Every member that is itself a list is replaced by its own wrapped form;
    all other members are kept unchanged.
    """
    expanded: List[Any] = []
    for member in obj_nodes:
        if isinstance(member, list):
            member = cls._expand_nested_list(member)
        expanded.append(member)
    return {LIST: expanded}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,133 @@
|
||||
"""
|
||||
This is an rdflib plugin for parsing N-Quads files into Conjunctive
|
||||
graphs that can be used and queried. The store that backs the graph
|
||||
*must* be able to handle contexts.
|
||||
|
||||
>>> from rdflib import ConjunctiveGraph, URIRef, Namespace
|
||||
>>> g = ConjunctiveGraph()
|
||||
>>> data = open("test/data/nquads.rdflib/example.nquads", "rb")
|
||||
>>> g.parse(data, format="nquads") # doctest:+ELLIPSIS
|
||||
<Graph identifier=... (<class 'rdflib.graph.Graph'>)>
|
||||
>>> assert len(g.store) == 449
|
||||
>>> # There should be 16 separate contexts
|
||||
>>> assert len([x for x in g.store.contexts()]) == 16
|
||||
>>> # is the name of entity E10009 "Arco Publications"?
|
||||
>>> # (in graph http://bibliographica.org/entity/E10009)
|
||||
>>> # Looking for:
|
||||
>>> # <http://bibliographica.org/entity/E10009>
|
||||
>>> # <http://xmlns.com/foaf/0.1/name>
|
||||
>>> # "Arco Publications"
|
||||
>>> # <http://bibliographica.org/entity/E10009>
|
||||
>>> s = URIRef("http://bibliographica.org/entity/E10009")
|
||||
>>> FOAF = Namespace("http://xmlns.com/foaf/0.1/")
|
||||
>>> assert(g.value(s, FOAF.name).eq("Arco Publications"))
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from codecs import getreader
|
||||
from typing import Any, MutableMapping, Optional
|
||||
|
||||
from rdflib.exceptions import ParserError as ParseError
|
||||
from rdflib.graph import ConjunctiveGraph, Dataset, Graph
|
||||
from rdflib.parser import InputSource
|
||||
|
||||
# Build up from the NTriples parser:
|
||||
from rdflib.plugins.parsers.ntriples import W3CNTriplesParser, r_tail, r_wspace
|
||||
from rdflib.term import BNode
|
||||
|
||||
__all__ = ["NQuadsParser"]
|
||||
|
||||
_BNodeContextType = MutableMapping[str, BNode]
|
||||
|
||||
|
||||
class NQuadsParser(W3CNTriplesParser):
    """N-Quads parser built on top of the line-oriented N-Triples parser.

    Each statement is parsed like an N-Triples statement followed by an
    optional fourth term (an IRI or a blank node) that names the graph the
    triple is added to.
    """

    # type error: Signature of "parse" incompatible with supertype "W3CNTriplesParser"
    def parse(  # type: ignore[override]
        self,
        inputsource: InputSource,
        sink: Graph,
        bnode_context: Optional[_BNodeContextType] = None,
        skolemize: bool = False,
        **kwargs: Any,
    ):
        """
        Parse inputsource as an N-Quads file.

        :type inputsource: `rdflib.parser.InputSource`
        :param inputsource: the source of N-Quads-formatted data
        :type sink: `rdflib.graph.Graph`
        :param sink: where to send parsed triples; its store must be
                     context-aware
        :type bnode_context: `dict`, optional
        :param bnode_context: a dict mapping blank node identifiers to `~rdflib.term.BNode` instances.
                              See `.W3CNTriplesParser.parse`
        :param skolemize: stored on the parser and consulted when blank node
                          labels are read
        :return: the `Dataset` (sharing ``sink.store``) that received the quads
        :raises ParseError: if the source is not file-like or a line is invalid;
                            the per-line error is chained as ``__cause__``
        """
        assert (
            sink.store.context_aware
        ), "NQuadsParser must be given a context-aware store."
        # Set default_union to True to mimic ConjunctiveGraph behavior
        ds = Dataset(store=sink.store, default_union=True)
        ds_default = ds.default_context  # the DEFAULT_DATASET_GRAPH_ID

        # Work out which graph should receive quads that carry no graph term.
        new_default_context = None
        if isinstance(sink, (Dataset, ConjunctiveGraph)):
            new_default_context = sink.default_context
        elif sink.identifier is not None:
            if sink.identifier == ds_default.identifier:
                new_default_context = sink
            else:
                new_default_context = ds.get_context(sink.identifier)

        if new_default_context is not None:
            ds.default_context = new_default_context
            ds.remove_graph(ds_default)  # remove the original unused default graph
        # type error: Incompatible types in assignment (expression has type "ConjunctiveGraph", base class "W3CNTriplesParser" defined the type as "Union[DummySink, NTGraphSink]")
        self.sink: Dataset = ds  # type: ignore[assignment]
        self.skolemize = skolemize

        source = inputsource.getCharacterStream()
        if not source:
            # No character stream: decode the byte stream as UTF-8 instead.
            source = inputsource.getByteStream()
            source = getreader("utf-8")(source)

        if not hasattr(source, "read"):
            raise ParseError("Item to parse must be a file-like object.")

        self.file = source
        self.buffer = ""
        while True:
            # Keep the untouched line: parseline() consumes self.line as it goes,
            # so the error message below needs its own copy.
            raw_line = self.readline()
            self.line = raw_line
            if self.line is None:
                break
            try:
                self.parseline(bnode_context)
            except ParseError as msg:
                raise ParseError(
                    "Invalid line (%s):\n%r" % (msg, raw_line)
                ) from msg

        return self.sink

    def parseline(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
        """Parse the statement held in ``self.line`` and add it to the sink.

        Empty lines and ``#`` comment lines are ignored. A statement without
        a graph term is added to the sink's default context.

        :raises ParseError: if anything is left on the line after the
                            statement terminator
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith("#"):
            return  # The line is empty or a comment

        subject = self.subject(bnode_context)
        self.eat(r_wspace)

        predicate = self.predicate()
        self.eat(r_wspace)

        obj = self.object(bnode_context)
        self.eat(r_wspace)

        # Optional fourth term: the graph label (False when absent).
        context = self.uriref() or self.nodeid(bnode_context)
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")
        # Must have a context aware store - add on a normal Graph
        # discards anything where the ctx != graph.identifier
        if context:
            self.sink.get_context(context).add((subject, predicate, obj))
        else:
            self.sink.default_context.add((subject, predicate, obj))
|
||||
@@ -0,0 +1,385 @@
|
||||
"""\
|
||||
N-Triples Parser
|
||||
License: GPL 2, W3C, BSD, or MIT
|
||||
Author: Sean B. Palmer, inamidst.com
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import codecs
|
||||
import re
|
||||
from io import BytesIO, StringIO, TextIOBase
|
||||
from typing import (
|
||||
IO,
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Match,
|
||||
MutableMapping,
|
||||
Optional,
|
||||
Pattern,
|
||||
TextIO,
|
||||
Union,
|
||||
)
|
||||
|
||||
from rdflib.compat import _string_escape_map, decodeUnicodeEscape
|
||||
from rdflib.exceptions import ParserError as ParseError
|
||||
from rdflib.parser import InputSource, Parser
|
||||
from rdflib.term import BNode as bNode
|
||||
from rdflib.term import Literal, URIRef
|
||||
from rdflib.term import URIRef as URI # noqa: N814
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import typing_extensions as te
|
||||
|
||||
from rdflib.graph import Graph, _ObjectType, _PredicateType, _SubjectType
|
||||
|
||||
__all__ = [
|
||||
"unquote",
|
||||
"uriquote",
|
||||
"W3CNTriplesParser",
|
||||
"NTGraphSink",
|
||||
"NTParser",
|
||||
"DummySink",
|
||||
]
|
||||
|
||||
# --- Pattern fragments -------------------------------------------------------
# An IRI in angle brackets; group 1 is the text between "<" and ">".
uriref = r'<([^:]+:[^\s"<>]*)>'
# A double-quoted string that may contain backslash escapes; group 1 is the body.
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
# Optional literal suffix: either "@lang" or "^^<datatype>".
litinfo = r"(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)|\^\^" + uriref + r")?"

# --- Compiled patterns -------------------------------------------------------
# One line terminated by CRLF, CR or LF; group 1 is the line without terminator.
r_line = re.compile(r"([^\r\n]*)(?:\r\n|\r|\n)")
# Zero or more / one or more spaces or tabs.
r_wspace = re.compile(r"[ \t]*")
r_wspaces = re.compile(r"[ \t]+")
# Statement terminator "." with optional surrounding blanks and trailing comment.
r_tail = re.compile(r"[ \t]*\.[ \t]*(#.*)?")
r_uriref = re.compile(uriref)
# Blank node label "_:name"; group 1 is the name after "_:".
r_nodeid = re.compile(r"_:([A-Za-z0-9_:]([-A-Za-z0-9_:\.]*[-A-Za-z0-9_:])?)")
# Literal body followed by the optional language tag or datatype.
r_literal = re.compile(literal + litinfo)

# Number of characters requested from the input per read() call.
bufsiz = 2048
# Module-wide switch checked by unquote() and uriquote().
validate = False
|
||||
|
||||
|
||||
class DummySink:
    """Fallback sink that prints every triple and counts how many it received."""

    def __init__(self):
        # Number of triples passed to triple() so far.
        self.length = 0

    def triple(self, s, p, o):
        """Count the triple and print its three terms to standard output."""
        self.length = self.length + 1
        print(s, p, o)
|
||||
|
||||
|
||||
# Run of printable ASCII characters excluding the double quote and backslash.
r_safe = re.compile(r"([\x20\x21\x23-\x5B\x5D-\x7E]+)")
# Backslash followed by one of the single-character escape letters.
r_quot = re.compile(r"""\\([tbnrf"'\\])""")
# \uXXXX (group 1) or \UXXXXXXXX (group 2) numeric escape.
r_uniquot = re.compile(r"\\u([0-9A-Fa-f]{4})|\\U([0-9A-Fa-f]{8})")
|
||||
|
||||
|
||||
def unquote(s: str) -> str:
    """Unquote an N-Triples string.

    With the module-level ``validate`` flag off, the whole string is handed to
    the escape decoder in one go. With it on, the string is scanned piece by
    piece and anything that is neither a safe character run, a known
    single-character escape, nor a ``\\u``/``\\U`` escape raises `ParseError`.
    """
    if not validate:
        if isinstance(s, str):  # nquads
            return decodeUnicodeEscape(s)
        return s.decode("unicode-escape")  # type: ignore[unreachable]

    pieces = []
    pos = 0
    length = len(s)
    while pos < length:
        safe_run = r_safe.match(s, pos)
        if safe_run:
            pieces.append(safe_run.group(1))
            pos = safe_run.end()
            continue

        simple_escape = r_quot.match(s, pos)
        if simple_escape:
            pieces.append(_string_escape_map[simple_escape.group(1)])
            pos += 2  # backslash plus the escape letter
            continue

        numeric_escape = r_uniquot.match(s, pos)
        if numeric_escape:
            short_hex, long_hex = numeric_escape.groups()
            codepoint = int(short_hex or long_hex, 16)
            if codepoint > 0x10FFFF:
                raise ParseError("Disallowed codepoint: %08X" % codepoint)
            pieces.append(chr(codepoint))
            pos = numeric_escape.end()
        elif s.startswith("\\", pos):
            raise ParseError("Illegal escape at: %s..." % s[pos : pos + 10])
        else:
            raise ParseError("Illegal literal character: %r" % s[pos])
    return "".join(pieces)
|
||||
|
||||
|
||||
# A single character in the range U+0080-U+00FF.
r_hibyte = re.compile(r"([\x80-\xFF])")


def uriquote(uri: str) -> str:
    """Return *uri*, percent-encoding high characters when validating.

    When the module-level ``validate`` flag is off the input is returned
    unchanged; otherwise every character matched by ``r_hibyte`` is replaced
    by ``%XX`` with its code point in upper-case hex.
    """
    if not validate:
        return uri

    def _percent_encode(match):
        return "%%%02X" % ord(match.group(1))

    return r_hibyte.sub(_percent_encode, uri)
|
||||
|
||||
|
||||
_BNodeContextType = MutableMapping[str, bNode]
|
||||
|
||||
|
||||
class W3CNTriplesParser:
    """An N-Triples Parser.
    This is a legacy-style Triples parser for NTriples provided by W3C
    Usage::

          p = W3CNTriplesParser(sink=MySink())
          sink = p.parse(f) # file; use parsestring for a string

    To define a context in which blank node identifiers refer to the same blank node
    across instances of W3CNTriplesParser, pass the same dict as ``bnode_context`` to each
    instance. By default, a new blank node context is created for each instance of
    `W3CNTriplesParser`.
    """

    __slots__ = ("_bnode_ids", "sink", "buffer", "file", "line", "skolemize")

    def __init__(
        self,
        sink: Optional[Union[DummySink, NTGraphSink]] = None,
        bnode_context: Optional[_BNodeContextType] = None,
    ):
        """Create a parser.

        :param sink: object whose ``triple(s, p, o)`` method receives each
                     parsed statement; a `DummySink` is used when omitted
        :param bnode_context: mapping from blank node labels to blank nodes
                              used when a call does not supply its own
        """
        self.skolemize = False

        if bnode_context is not None:
            self._bnode_ids = bnode_context
        else:
            self._bnode_ids = {}

        self.sink: Union[DummySink, NTGraphSink]
        if sink is not None:
            self.sink = sink
        else:
            self.sink = DummySink()

        # Unconsumed input text, the input stream, and the remainder of the
        # line currently being parsed.
        self.buffer: Optional[str] = None
        self.file: Optional[Union[TextIO, codecs.StreamReader]] = None
        self.line: Optional[str] = ""

    def parse(
        self,
        f: Union[TextIO, IO[bytes], codecs.StreamReader],
        bnode_context: Optional[_BNodeContextType] = None,
        skolemize: bool = False,
    ) -> Union[DummySink, NTGraphSink]:
        """
        Parse f as an N-Triples file.

        :type f: :term:`file object`
        :param f: the N-Triples source
        :type bnode_context: `dict`, optional
        :param bnode_context: a dict mapping blank node identifiers (e.g., ``a`` in ``_:a``)
                              to `~rdflib.term.BNode` instances. An empty dict can be
                              passed in to define a distinct context for a given call to
                              `parse`.
        :param skolemize: when true, blank node labels are turned into
                          skolemized nodes instead of being remapped
        :return: the sink given at construction time
        :raises ParseError: if *f* has no ``read`` attribute or a line is
                            invalid; the per-line error is chained as
                            ``__cause__``
        """

        if not hasattr(f, "read"):
            raise ParseError("Item to parse must be a file-like object.")

        if not hasattr(f, "encoding") and not hasattr(f, "charbuffer"):
            # someone still using a bytestream here?
            f = codecs.getreader("utf-8")(f)

        self.skolemize = skolemize
        self.file = f  # type: ignore[assignment]
        self.buffer = ""
        while True:
            self.line = self.readline()
            if self.line is None:
                break
            try:
                self.parseline(bnode_context=bnode_context)
            except ParseError as err:
                # self.line holds whatever had not been consumed when the
                # error occurred; chain the original error for the details.
                raise ParseError("Invalid line: {}".format(self.line)) from err
        return self.sink

    def parsestring(self, s: Union[bytes, bytearray, str], **kwargs) -> None:
        """Parse s as an N-Triples string.

        Bytes input is decoded as UTF-8. Extra keyword arguments are passed
        on to `parse`.
        """
        if not isinstance(s, (str, bytes, bytearray)):
            raise ParseError("Item to parse must be a string instance.")
        f: Union[codecs.StreamReader, StringIO]
        if isinstance(s, (bytes, bytearray)):
            f = codecs.getreader("utf-8")(BytesIO(s))
        else:
            f = StringIO(s)
        self.parse(f, **kwargs)

    def readline(self) -> Optional[str]:
        """Read an N-Triples line from buffered input.

        Returns the next line without its terminator, or ``None`` once the
        input is exhausted.
        """
        # N-Triples lines end in either CRLF, CR, or LF
        # Therefore, we can't just use f.readline()
        if not self.buffer:
            # type error: Item "None" of "Union[TextIO, StreamReader, None]" has no attribute "read"
            buffer = self.file.read(bufsiz)  # type: ignore[union-attr]
            if not buffer:
                return None
            self.buffer = buffer

        while True:
            m = r_line.match(self.buffer)
            if m:  # the more likely prospect
                self.buffer = self.buffer[m.end() :]
                return m.group(1)
            else:
                # type error: Item "None" of "Union[TextIO, StreamReader, None]" has no attribute "read"
                buffer = self.file.read(bufsiz)  # type: ignore[union-attr]
                if not buffer and not self.buffer.isspace():
                    # Last line does not need to be terminated with a newline
                    buffer += "\n"
                elif not buffer:
                    return None
                self.buffer += buffer

    def parseline(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
        """Parse the statement in ``self.line`` and send it to the sink.

        Empty lines and ``#`` comment lines are ignored.

        :raises ParseError: if text remains after the statement terminator
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith("#"):
            return  # The line is empty or a comment

        subject = self.subject(bnode_context)
        self.eat(r_wspaces)

        predicate = self.predicate()
        self.eat(r_wspaces)

        object_ = self.object(bnode_context)
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage: {}".format(self.line))
        self.sink.triple(subject, predicate, object_)

    def peek(self, token: str) -> bool:
        """Return whether the unconsumed part of the line starts with *token*."""
        return self.line.startswith(token)  # type: ignore[union-attr]

    def eat(self, pattern: Pattern[str]) -> Match[str]:
        """Match *pattern* at the start of the line and consume the match.

        :raises ParseError: if the pattern does not match
        """
        m = pattern.match(self.line)  # type: ignore[arg-type]
        if not m:  # @@ Why can't we get the original pattern?
            # print(dir(pattern))
            # print repr(self.line), type(self.line)
            raise ParseError("Failed to eat %s at %s" % (pattern.pattern, self.line))
        self.line = self.line[m.end() :]  # type: ignore[index]
        return m

    def subject(self, bnode_context=None) -> Union[bNode, URIRef]:
        """Read the subject term: an IRI or a blank node."""
        # @@ Consider using dictionary cases
        subj = self.uriref() or self.nodeid(bnode_context)
        if not subj:
            raise ParseError("Subject must be uriref or nodeID")
        return subj

    def predicate(self) -> Union[bNode, URIRef]:
        """Read the predicate term, which must be an IRI."""
        pred = self.uriref()
        if not pred:
            raise ParseError("Predicate must be uriref")
        return pred

    def object(
        self, bnode_context: Optional[_BNodeContextType] = None
    ) -> Union[URI, bNode, Literal]:
        """Read the object term: an IRI, a blank node or a literal."""
        objt = self.uriref() or self.nodeid(bnode_context) or self.literal()
        # Identity check on purpose: only the readers' explicit False
        # sentinel means "no term here".
        if objt is False:
            raise ParseError("Unrecognised object type")
        return objt

    def uriref(self) -> Union[te.Literal[False], URI]:
        """Consume an IRI if the line starts with ``<``; otherwise return False."""
        if self.peek("<"):
            uri = self.eat(r_uriref).group(1)
            uri = unquote(uri)
            uri = uriquote(uri)
            return URI(uri)
        return False

    def nodeid(
        self, bnode_context: Optional[_BNodeContextType] = None
    ) -> Union[te.Literal[False], bNode, URI]:
        """Consume a blank node label if the line starts with ``_``.

        Returns False when the line does not start with ``_``. In skolemize
        mode the label is skolemized directly; otherwise it is looked up in
        (and, if new, recorded in) the blank node context.
        """
        if self.peek("_"):
            if self.skolemize:
                bnode_id = self.eat(r_nodeid).group(1)
                return bNode(bnode_id).skolemize()

            else:
                # Fix for https://github.com/RDFLib/rdflib/issues/204
                if bnode_context is None:
                    bnode_context = self._bnode_ids
                bnode_id = self.eat(r_nodeid).group(1)
                new_id = bnode_context.get(bnode_id, None)
                if new_id is not None:
                    # Re-map to id specific to this doc
                    return bNode(new_id)
                else:
                    # Replace with freshly-generated document-specific BNode id
                    bnode = bNode()
                    # Store the mapping
                    bnode_context[bnode_id] = bnode
                    return bnode
        return False

    def literal(self) -> Union[te.Literal[False], Literal]:
        """Consume a literal if the line starts with ``"``; otherwise return False.

        :raises ParseError: if both a language tag and a datatype are present
        """
        if self.peek('"'):
            lit, lang, dtype = self.eat(r_literal).groups()
            # Normalise a missing or empty language tag to None.
            lang = lang or None
            if dtype:
                dtype = unquote(dtype)
                dtype = uriquote(dtype)
                dtype = URI(dtype)
            else:
                dtype = None
            if lang and dtype:
                raise ParseError("Can't have both a language and a datatype")
            lit = unquote(lit)
            return Literal(lit, lang, dtype)
        return False
|
||||
|
||||
|
||||
class NTGraphSink:
    """Sink that forwards every parsed triple to a graph's ``add`` method."""

    __slots__ = ("g",)

    def __init__(self, graph: Graph):
        # The graph that receives the triples.
        self.g = graph

    def triple(self, s: _SubjectType, p: _PredicateType, o: _ObjectType) -> None:
        """Add the statement ``(s, p, o)`` to the wrapped graph."""
        statement = (s, p, o)
        self.g.add(statement)
|
||||
|
||||
|
||||
class NTParser(Parser):
    """parser for the ntriples format, often stored with the .nt extension

    See http://www.w3.org/TR/rdf-testcases/#ntriples"""

    __slots__ = ()

    @classmethod
    def parse(cls, source: InputSource, sink: Graph, **kwargs: Any) -> None:
        """
        Parse the NT format

        The stream taken from *source* is closed when parsing finishes,
        whether it succeeds or raises.

        :type source: `rdflib.parser.InputSource`
        :param source: the source of NT-formatted data
        :type sink: `rdflib.graph.Graph`
        :param sink: where to send parsed triples
        :param kwargs: Additional arguments to pass to `.W3CNTriplesParser.parse`
        """
        f: Union[TextIO, IO[bytes], codecs.StreamReader]
        f = source.getCharacterStream()
        if not f:
            b = source.getByteStream()
            # TextIOBase includes: StringIO and TextIOWrapper
            if isinstance(b, TextIOBase):
                # f is not really a ByteStream, but a CharacterStream
                f = b  # type: ignore[assignment]
            else:
                # since N-Triples 1.1 files can and should be utf-8 encoded
                f = codecs.getreader("utf-8")(b)
        parser = W3CNTriplesParser(NTGraphSink(sink))
        try:
            parser.parse(f, **kwargs)
        finally:
            # Release the stream even when a line fails to parse.
            f.close()
|
||||
@@ -0,0 +1,183 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from codecs import getreader
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Any, MutableMapping, Optional, Union
|
||||
|
||||
from rdflib.exceptions import ParserError as ParseError
|
||||
from rdflib.graph import Dataset
|
||||
from rdflib.parser import InputSource
|
||||
from rdflib.plugins.parsers.nquads import NQuadsParser
|
||||
|
||||
# Build up from the NTriples parser:
|
||||
from rdflib.plugins.parsers.ntriples import r_nodeid, r_tail, r_uriref, r_wspace
|
||||
from rdflib.term import BNode, URIRef
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import typing_extensions as te
|
||||
|
||||
__all__ = ["RDFPatchParser", "Operation"]
|
||||
|
||||
_BNodeContextType = MutableMapping[str, BNode]
|
||||
|
||||
|
||||
class Operation(Enum):
    """
    RDF Patch operation codes.

    Each member's value is the code that starts a patch line:

    - ``A``  -- `AddTripleOrQuad`: add a triple or quad.
    - ``D``  -- `DeleteTripleOrQuad`: delete a triple or quad.
    - ``PA`` -- `AddPrefix`: add a prefix.
    - ``PD`` -- `DeletePrefix`: delete a prefix.
    - ``TX`` -- `TransactionStart`: start a transaction.
    - ``TC`` -- `TransactionCommit`: commit a transaction.
    - ``TA`` -- `TransactionAbort`: abort a transaction.
    - ``H``  -- `Header`: specify a header.
    """

    # Data changes
    AddTripleOrQuad = "A"
    DeleteTripleOrQuad = "D"
    # Prefix changes
    AddPrefix = "PA"
    DeletePrefix = "PD"
    # Transaction markers
    TransactionStart = "TX"
    TransactionCommit = "TC"
    TransactionAbort = "TA"
    # Header
    Header = "H"
|
||||
|
||||
|
||||
class RDFPatchParser(NQuadsParser):
    """Parser that applies the operations of an RDF Patch document to a Dataset.

    Add/delete lines change triples or quads, prefix lines change namespace
    bindings; header and transaction lines are recognised and then skipped.
    """

    def parse(  # type: ignore[override]
        self,
        inputsource: InputSource,
        sink: Dataset,
        bnode_context: Optional[_BNodeContextType] = None,
        skolemize: bool = False,
        **kwargs: Any,
    ) -> Dataset:
        """
        Parse inputsource as an RDF Patch file.

        :type inputsource: `rdflib.parser.InputSource`
        :param inputsource: the source of RDF Patch formatted data
        :type sink: `rdflib.graph.Dataset`
        :param sink: where to send parsed data; its store must be context-aware
        :type bnode_context: `dict`, optional
        :param bnode_context: a dict mapping blank node identifiers to `~rdflib.term.BNode` instances.
                              See `.W3CNTriplesParser.parse`
        :return: a `Dataset` over ``sink.store`` with the patch applied
        :raises ParseError: if the source is not file-like or a line is invalid;
                            the per-line error is chained as ``__cause__``
        """
        assert sink.store.context_aware, (
            "RDFPatchParser must be given" " a context aware store."
        )
        # type error: Incompatible types in assignment (expression has type "ConjunctiveGraph", base class "W3CNTriplesParser" defined the type as "Union[DummySink, NTGraphSink]")
        self.sink: Dataset = Dataset(store=sink.store)
        self.skolemize = skolemize

        source = inputsource.getCharacterStream()
        if not source:
            # No character stream: decode the byte stream as UTF-8 instead.
            source = inputsource.getByteStream()
            source = getreader("utf-8")(source)

        if not hasattr(source, "read"):
            raise ParseError("Item to parse must be a file-like object.")

        self.file = source
        self.buffer = ""
        while True:
            # Keep the untouched line for the error message; parsing consumes
            # self.line piece by piece.
            raw_line = self.readline()
            self.line = raw_line
            if self.line is None:
                break
            try:
                self.parsepatch(bnode_context)
            except ParseError as msg:
                raise ParseError(
                    "Invalid line (%s):\n%r" % (msg, raw_line)
                ) from msg
        return self.sink

    def parsepatch(self, bnode_context: Optional[_BNodeContextType] = None) -> None:
        """Parse one patch line from ``self.line`` and apply its operation.

        Empty lines and ``#`` comment lines are ignored. Operations other than
        add/delete of triples or quads and add/delete of prefixes are consumed
        without any effect.
        """
        self.eat(r_wspace)
        # From spec: "No comments should be included (comments start # and run to end
        # of line)."
        if (not self.line) or self.line.startswith("#"):
            return  # The line is empty or a comment

        # if header, transaction, skip
        operation = self.operation()
        self.eat(r_wspace)

        if operation in [Operation.AddTripleOrQuad, Operation.DeleteTripleOrQuad]:
            self.add_or_remove_triple_or_quad(operation, bnode_context)
        elif operation == Operation.AddPrefix:
            self.add_prefix()
        elif operation == Operation.DeletePrefix:
            self.delete_prefix()

    def add_or_remove_triple_or_quad(
        self, operation, bnode_context: Optional[_BNodeContextType] = None
    ) -> None:
        """Parse the statement left on the line and add or remove it.

        :param operation: `Operation.AddTripleOrQuad` or
                          `Operation.DeleteTripleOrQuad`
        :raises ParseError: if text remains after the statement terminator
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith("#"):
            return  # The line is empty or a comment

        # Each position may hold a blank node written as <_:label>.
        subject = self.labeled_bnode() or self.subject(bnode_context)
        self.eat(r_wspace)

        predicate = self.predicate()
        self.eat(r_wspace)

        obj = self.labeled_bnode() or self.object(bnode_context)
        self.eat(r_wspace)

        # Optional fourth term: the graph label (False when absent).
        context = self.labeled_bnode() or self.uriref() or self.nodeid(bnode_context)
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")
        # Must have a context aware store - add on a normal Graph
        # discards anything where the ctx != graph.identifier
        if operation == Operation.AddTripleOrQuad:
            if context:
                self.sink.get_context(context).add((subject, predicate, obj))
            else:
                self.sink.default_context.add((subject, predicate, obj))
        elif operation == Operation.DeleteTripleOrQuad:
            if context:
                self.sink.get_context(context).remove((subject, predicate, obj))
            else:
                self.sink.default_context.remove((subject, predicate, obj))

    def add_prefix(self):
        """Bind the prefix and namespace given on the rest of the line.

        The rest of the line must be exactly three space-separated parts
        (prefix, namespace, terminator); quote characters are dropped first.
        """
        # Extract prefix and URI from the line
        prefix, ns, _ = self.line.replace('"', "").replace("'", "").split(" ")  # type: ignore[union-attr]
        ns_stripped = ns.strip("<>")
        self.sink.bind(prefix, ns_stripped)

    def delete_prefix(self):
        """Rebind the prefix given on the rest of the line to ``None``."""
        prefix, _, _ = self.line.replace('"', "").replace("'", "").split(" ")  # type: ignore[union-attr]
        self.sink.namespace_manager.bind(prefix, None, replace=True)

    def operation(self) -> Operation:
        """Identify and consume the operation code at the start of the line.

        :raises ValueError: if the line starts with no known operation code
        """
        for op in Operation:
            if self.line.startswith(op.value):  # type: ignore[union-attr]
                self.eat_op(op.value)
                return op
        raise ValueError(
            f'Invalid or no Operation found in line: "{self.line}". Valid Operations '
            f"codes are {', '.join([op.value for op in Operation])}"
        )

    def eat_op(self, op: str) -> None:
        """Remove the operation code *op* from the start of the line.

        Only the exact prefix is removed. ``str.lstrip(op)`` would treat *op*
        as a set of characters and also swallow following characters that
        happen to be in that set (e.g. the second ``A`` of ``AA``).
        """
        line = self.line
        if line.startswith(op):  # type: ignore[union-attr]
            self.line = line[len(op) :]  # type: ignore[index]

    def nodeid(
        self, bnode_context: Optional[_BNodeContextType] = None
    ) -> Union[te.Literal[False], BNode, URIRef]:
        """Consume a ``_:label`` blank node, keeping the label as written.

        Returns False when the line does not start with ``_``. Unlike the
        inherited reader, *bnode_context* is not consulted here.
        """
        if self.peek("_"):
            return BNode(self.eat(r_nodeid).group(1))
        return False

    def labeled_bnode(self):
        """Consume a blank node written as ``<_:label>``.

        Returns False when the line does not start with ``<_``.

        :raises ParseError: if the bracketed text is not a valid ``_:label``
        """
        if self.peek("<_"):
            plain_uri = self.eat(r_uriref).group(1)
            label_match = r_nodeid.match(plain_uri)
            if label_match is None:
                # e.g. <_foo:bar>: starts with "<_" but is not "_:" + label.
                raise ParseError("Invalid labeled blank node: %s" % plain_uri)
            return BNode(label_match.group(1))
        return False
|
||||
@@ -0,0 +1,651 @@
|
||||
"""
|
||||
An RDF/XML parser for RDFLib
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, NoReturn, Optional, Tuple
|
||||
from urllib.parse import urldefrag, urljoin
|
||||
from xml.sax import handler, make_parser, xmlreader
|
||||
from xml.sax.handler import ErrorHandler
|
||||
from xml.sax.saxutils import escape, quoteattr
|
||||
|
||||
from rdflib.exceptions import Error, ParserError
|
||||
from rdflib.graph import Graph
|
||||
from rdflib.namespace import RDF, is_ncname
|
||||
from rdflib.parser import InputSource, Parser
|
||||
from rdflib.plugins.parsers.RDFVOC import RDFVOC
|
||||
from rdflib.term import BNode, Identifier, Literal, URIRef
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# from xml.sax.expatreader import ExpatLocator
|
||||
from xml.sax.xmlreader import AttributesImpl, Locator
|
||||
|
||||
from rdflib.graph import _ObjectType, _SubjectType, _TripleType
|
||||
|
||||
__all__ = ["create_parser", "BagID", "ElementHandler", "RDFXMLHandler", "RDFXMLParser"]
|
||||
|
||||
# Second name for the same vocabulary class.
RDFNS = RDFVOC

# http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
# A mapping from unqualified terms to their qualified version.
UNQUALIFIED = {
    "about": RDFVOC.about,
    "ID": RDFVOC.ID,
    "type": RDFVOC.type,
    "resource": RDFVOC.resource,
    "parseType": RDFVOC.parseType,
}

# http://www.w3.org/TR/rdf-syntax-grammar/#coreSyntaxTerms
CORE_SYNTAX_TERMS = [
    RDFVOC.RDF,
    RDFVOC.ID,
    RDFVOC.about,
    RDFVOC.parseType,
    RDFVOC.resource,
    RDFVOC.nodeID,
    RDFVOC.datatype,
]

# http://www.w3.org/TR/rdf-syntax-grammar/#syntaxTerms
SYNTAX_TERMS = CORE_SYNTAX_TERMS + [RDFVOC.Description, RDFVOC.li]

# http://www.w3.org/TR/rdf-syntax-grammar/#oldTerms
OLD_TERMS = [
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"),
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"),
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"),
]

# Node elements: excluded element names and the attributes read from them.
NODE_ELEMENT_EXCEPTIONS = CORE_SYNTAX_TERMS + [RDFVOC.li] + OLD_TERMS
NODE_ELEMENT_ATTRIBUTES = [RDFVOC.ID, RDFVOC.nodeID, RDFVOC.about]

# Property elements and attributes: excluded names and the attributes read.
PROPERTY_ELEMENT_EXCEPTIONS = CORE_SYNTAX_TERMS + [RDFVOC.Description] + OLD_TERMS
PROPERTY_ATTRIBUTE_EXCEPTIONS = (
    CORE_SYNTAX_TERMS + [RDFVOC.Description, RDFVOC.li] + OLD_TERMS
)
PROPERTY_ELEMENT_ATTRIBUTES = [RDFVOC.ID, RDFVOC.resource, RDFVOC.nodeID]

# (namespace, local name) keys for the xml:base and xml:lang attributes.
XMLNS = "http://www.w3.org/XML/1998/namespace"
BASE = (XMLNS, "base")
LANG = (XMLNS, "lang")
|
||||
|
||||
|
||||
class BagID(URIRef):
    """URIRef that also keeps a running ``rdf:_n`` member counter."""

    __slots__ = ["li"]

    def __init__(self, val):
        # type error: Too many arguments for "__init__" of "object"
        super(URIRef, self).__init__(val)  # type: ignore[call-arg]
        # Index of the most recently issued member property.
        self.li = 0

    def next_li(self):
        """Advance the counter and return the matching ``_<n>`` term of RDFNS."""
        index = self.li + 1
        self.li = index
        # type error: Type expected within [...]
        return RDFNS["_%s" % index]  # type: ignore[misc]
|
||||
|
||||
|
||||
class ElementHandler:
    """Per-element parsing state kept on the RDF/XML handler's stack.

    ``start``, ``char`` and ``end`` hold the callbacks for the element's
    events; the remaining slots carry values collected while the element is
    processed. Every slot starts out as ``None`` except the ``li`` counter.
    """

    __slots__ = [
        "start",
        "char",
        "end",
        "li",
        "id",
        "base",
        "subject",
        "predicate",
        "object",
        "list",
        "language",
        "datatype",
        "declared",
        "data",
    ]

    def __init__(self):
        self.start = None
        self.char = None
        self.end = None
        self.li = 0
        self.id = None
        self.base = None
        self.subject = None
        # Initialised like the other slots so that reading it before a
        # predicate has been set yields None instead of AttributeError.
        self.predicate = None
        self.object = None
        self.list = None
        self.language = None
        self.datatype = None
        self.declared = None
        self.data = None

    def next_li(self):
        """Advance the ``li`` counter and return the matching ``_<n>`` term of RDFVOC."""
        self.li += 1
        return RDFVOC["_%s" % self.li]
|
||||
|
||||
|
||||
class RDFXMLHandler(handler.ContentHandler):
|
||||
def __init__(self, store: Graph):
|
||||
self.store = store
|
||||
self.preserve_bnode_ids = False
|
||||
self.reset()
|
||||
|
||||
def reset(self) -> None:
|
||||
document_element = ElementHandler()
|
||||
document_element.start = self.document_element_start
|
||||
document_element.end = lambda name, qname: None
|
||||
self.stack: List[Optional[ElementHandler]] = [
|
||||
None,
|
||||
document_element,
|
||||
]
|
||||
self.ids: Dict[str, int] = {} # remember IDs we have already seen
|
||||
self.bnode: Dict[str, Identifier] = {}
|
||||
self._ns_contexts: List[Dict[str, Optional[str]]] = [
|
||||
{}
|
||||
] # contains uri -> prefix dicts
|
||||
self._current_context: Dict[str, Optional[str]] = self._ns_contexts[-1]
|
||||
|
||||
# ContentHandler methods
|
||||
|
||||
def setDocumentLocator(self, locator: Locator):
|
||||
self.locator = locator
|
||||
|
||||
def startDocument(self) -> None:
|
||||
pass
|
||||
|
||||
def startPrefixMapping(self, prefix: Optional[str], namespace: str) -> None:
|
||||
self._ns_contexts.append(self._current_context.copy())
|
||||
self._current_context[namespace] = prefix
|
||||
self.store.bind(prefix, namespace or "", override=False)
|
||||
|
||||
def endPrefixMapping(self, prefix: Optional[str]) -> None:
|
||||
self._current_context = self._ns_contexts[-1]
|
||||
del self._ns_contexts[-1]
|
||||
|
||||
def startElementNS(
|
||||
self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
|
||||
) -> None:
|
||||
stack = self.stack
|
||||
stack.append(ElementHandler())
|
||||
current = self.current
|
||||
parent = self.parent
|
||||
# type error: No overlaod for "get" of "AttributesImpl" mactches tuple (str, str)
|
||||
base = attrs.get(BASE, None) # type: ignore[call-overload, unused-ignore]
|
||||
if base is not None:
|
||||
base, frag = urldefrag(base)
|
||||
if parent and parent.base:
|
||||
base = urljoin(parent.base, base)
|
||||
else:
|
||||
systemId = self.locator.getPublicId() or self.locator.getSystemId()
|
||||
if systemId:
|
||||
base = urljoin(systemId, base)
|
||||
else:
|
||||
if parent:
|
||||
base = parent.base
|
||||
if base is None:
|
||||
systemId = self.locator.getPublicId() or self.locator.getSystemId()
|
||||
if systemId:
|
||||
base, frag = urldefrag(systemId)
|
||||
current.base = base
|
||||
# type error: No overlaod for "get" of "AttributesImpl" mactches tuple (str, str)
|
||||
language = attrs.get(LANG, None) # type: ignore[call-overload, unused-ignore]
|
||||
if language is None:
|
||||
if parent:
|
||||
language = parent.language
|
||||
current.language = language
|
||||
current.start(name, qname, attrs)
|
||||
|
||||
def endElementNS(self, name: Tuple[Optional[str], str], qname) -> None:
|
||||
self.current.end(name, qname)
|
||||
self.stack.pop()
|
||||
|
||||
def characters(self, content: str) -> None:
|
||||
char = self.current.char
|
||||
if char:
|
||||
char(content)
|
||||
|
||||
def ignorableWhitespace(self, content) -> None:
|
||||
pass
|
||||
|
||||
def processingInstruction(self, target, data) -> None:
|
||||
pass
|
||||
|
||||
def add_reified(self, sid: Identifier, spo: _TripleType):
|
||||
s, p, o = spo
|
||||
self.store.add((sid, RDF.type, RDF.Statement))
|
||||
self.store.add((sid, RDF.subject, s))
|
||||
self.store.add((sid, RDF.predicate, p))
|
||||
self.store.add((sid, RDF.object, o))
|
||||
|
||||
def error(self, message: str) -> NoReturn:
|
||||
locator = self.locator
|
||||
info = "%s:%s:%s: " % (
|
||||
locator.getSystemId(),
|
||||
locator.getLineNumber(),
|
||||
locator.getColumnNumber(),
|
||||
)
|
||||
raise ParserError(info + message)
|
||||
|
||||
def get_current(self) -> Optional[ElementHandler]:
|
||||
return self.stack[-2]
|
||||
|
||||
# Create a read only property called current so that self.current
|
||||
# give the current element handler.
|
||||
current = property(get_current)
|
||||
|
||||
def get_next(self) -> Optional[ElementHandler]:
|
||||
return self.stack[-1]
|
||||
|
||||
# Create a read only property that gives the element handler to be
|
||||
# used for the next element.
|
||||
next = property(get_next)
|
||||
|
||||
def get_parent(self) -> Optional[ElementHandler]:
|
||||
return self.stack[-3]
|
||||
|
||||
# Create a read only property that gives the current parent
|
||||
# element handler
|
||||
parent = property(get_parent)
|
||||
|
||||
def absolutize(self, uri: str) -> URIRef:
|
||||
# type error: Argument "allow_fragments" to "urljoin" has incompatible type "int"; expected "bool"
|
||||
result = urljoin(self.current.base, uri, allow_fragments=1) # type: ignore[arg-type]
|
||||
if uri and uri[-1] == "#" and result[-1] != "#":
|
||||
result = "%s#" % result
|
||||
return URIRef(result)
|
||||
|
||||
def convert(
    self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
) -> Tuple[URIRef, Dict[URIRef, str]]:
    """Turn a SAX ``(namespace, localname)`` name and its attributes into URIRefs.

    :param name: element name as delivered by SAX in namespace mode.
    :param qname: qualified name; unused here.
    :param attrs: SAX attributes keyed by ``(namespace, localname)``.
    :returns: ``(element_uri, atts)`` where ``atts`` maps attribute URIRefs
        to their string values. Attributes whose joined name starts with
        ``XMLNS``, or whose first three characters are "xml" in any case,
        are left out. Names listed in ``UNQUALIFIED`` are looked up in
        ``RDFNS``.
    """
    # A missing namespace means the local name alone is the identifier.
    element_uri = URIRef(name[1] if name[0] is None else "".join(name))

    atts = {}
    for attr_name, attr_value in attrs.items():
        att = attr_name[1] if attr_name[0] is None else "".join(attr_name)
        if att.startswith(XMLNS) or att[0:3].lower() == "xml":
            continue
        if att in UNQUALIFIED:
            atts[RDFNS[att]] = attr_value
        else:
            atts[URIRef(att)] = attr_value
    return element_uri, atts
|
||||
|
||||
def document_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Handle the document (root) element.

    An ``rdf:RDF`` root only arranges for its children to be treated as node
    elements; any other root is itself processed as a node element.
    """
    is_rdf_root = bool(name[0]) and URIRef("".join(name)) == RDFVOC.RDF
    if not is_rdf_root:
        self.node_element_start(name, qname, attrs)
        # TODO... set end to something that sets start such that
        # another element will cause error
        return

    upcoming = self.next
    upcoming.start = self.node_element_start
    upcoming.end = self.node_element_end
|
||||
|
||||
def node_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Handle the start of a node element.

    Determines the node's subject from ``rdf:ID``, ``rdf:nodeID`` or
    ``rdf:about`` (a fresh blank node when none is given), adds an
    ``rdf:type`` triple for typed nodes, adds one triple per property
    attribute, and stores the subject on the current handler. Child elements
    are set up to be handled as property elements.

    Problems are reported through ``self.error``.
    """
    node_uri, atts = self.convert(name, qname, attrs)
    element = self.current
    resolve = self.absolutize

    upcoming = self.next
    upcoming.start = self.property_element_start
    upcoming.end = self.property_element_end

    if node_uri in NODE_ELEMENT_EXCEPTIONS:
        self.error("Invalid node element URI: %s" % node_uri)

    has_id = RDFVOC.ID in atts
    has_node_id = RDFVOC.nodeID in atts
    has_about = RDFVOC.about in atts
    # The three identifying attributes are mutually exclusive.
    if has_id + has_node_id + has_about > 1:
        self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID")

    subject: _SubjectType
    if has_id:
        rdf_id = atts[RDFVOC.ID]
        if not is_ncname(rdf_id):
            self.error("rdf:ID value is not a valid NCName: %s" % rdf_id)
        subject = resolve("#%s" % rdf_id)
        if subject in self.ids:
            self.error("two elements cannot use the same ID: '%s'" % subject)
        self.ids[subject] = 1  # IDs can only appear once within a document
    elif has_node_id:
        label = atts[RDFVOC.nodeID]
        if not is_ncname(label):
            self.error("rdf:nodeID value is not a valid NCName: %s" % label)
        if self.preserve_bnode_ids is not False:
            subject = BNode(label)
        elif label in self.bnode:
            subject = self.bnode[label]
        else:
            # First sighting of this label: mint a node and remember it.
            subject = self.bnode[label] = BNode()
    elif has_about:
        subject = resolve(atts[RDFVOC.about])
    else:
        subject = BNode()

    if node_uri != RDFVOC.Description:  # S1
        self.store.add((subject, RDF.type, resolve(node_uri)))

    value: _ObjectType
    lang = element.language
    rdf_prefix = str(RDFNS)
    for att, raw in atts.items():
        if att.startswith(rdf_prefix):
            if att == RDF.type:  # S2
                self.store.add((subject, RDF.type, resolve(raw)))
                continue
            if att in NODE_ELEMENT_ATTRIBUTES:
                continue
            if att in PROPERTY_ATTRIBUTE_EXCEPTIONS:  # S3
                self.error("Invalid property attribute URI: %s" % att)
                continue  # for when error does not throw an exception
        # Every remaining attribute is a property with a plain literal value.
        predicate = resolve(att)
        try:
            value = Literal(raw, lang)
        except Error as e:
            self.error(e.msg)
        self.store.add((subject, predicate, value))

    element.subject = subject
|
||||
|
||||
def node_element_end(self, name: Tuple[str, str], qname) -> None:
    """Handle the end of a node element.

    Repeated node elements are only allowed at top level: if the parent
    already holds an object and the current handler is not ``self.stack[2]``,
    an error is reported. The parent's object is then set to this node's
    subject.
    """
    enclosing = self.parent
    if enclosing.object:
        if self.current != self.stack[2]:
            self.error(
                "Repeat node-elements inside property elements: %s" % "".join(name)
            )

    enclosing.object = self.current.subject
|
||||
|
||||
def property_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Handle the start of a property element.

    Sets the current handler's predicate, optional reification id, and the
    object implied by ``rdf:resource``, ``rdf:nodeID`` or ``rdf:parseType``,
    and installs the callbacks for the element's children and character
    data. When no ``rdf:datatype`` is present, the remaining attributes are
    added as triples about the object (a blank node is created if needed).

    :param name: ``(namespace, localname)`` of the element.
    :param qname: qualified name; only forwarded to ``convert``.
    :param attrs: SAX attributes of the element.

    Problems are reported through ``self.error``.
    """
    prop_uri, atts = self.convert(name, qname, attrs)
    current = self.current
    absolutize = self.absolutize

    upcoming = self.next
    obj: Optional[_ObjectType] = None
    current.data = None
    current.list = None

    if not prop_uri.startswith(str(RDFNS)):
        current.predicate = absolutize(prop_uri)
    elif prop_uri == RDFVOC.li:
        current.predicate = current.next_li()
    elif prop_uri in PROPERTY_ELEMENT_EXCEPTIONS:
        self.error("Invalid property element URI: %s" % prop_uri)
    else:
        current.predicate = absolutize(prop_uri)

    rdf_id = atts.get(RDFVOC.ID, None)
    if rdf_id is not None:
        if not is_ncname(rdf_id):
            self.error("rdf:ID value is not a valid NCName: %s" % rdf_id)
        current.id = absolutize("#%s" % rdf_id)
    else:
        current.id = None

    resource = atts.get(RDFVOC.resource, None)
    node_id = atts.get(RDFVOC.nodeID, None)
    parse_type = atts.get(RDFVOC.parseType, None)
    if resource is not None and node_id is not None:
        self.error("Property element cannot have both rdf:nodeID and rdf:resource")

    if resource is not None:
        obj = absolutize(resource)
        upcoming.start = self.node_element_start
        upcoming.end = self.node_element_end
    elif node_id is not None:
        if not is_ncname(node_id):
            self.error("rdf:nodeID value is not a valid NCName: %s" % node_id)
        if self.preserve_bnode_ids is False:
            if node_id in self.bnode:
                obj = self.bnode[node_id]
            else:
                # First sighting of this label: mint a node and remember it.
                obj = self.bnode[node_id] = BNode()
        else:
            obj = BNode(node_id)
        upcoming.start = self.node_element_start
        upcoming.end = self.node_element_end
    elif parse_type is not None:
        # With rdf:parseType only rdf:ID may accompany it.
        for att in atts:
            if att not in (RDFVOC.parseType, RDFVOC.ID):
                self.error("Property attr '%s' not allowed here" % att)
        if parse_type == "Resource":
            current.subject = obj = BNode()
            current.char = self.property_element_char
            upcoming.start = self.property_element_start
            upcoming.end = self.property_element_end
        elif parse_type == "Collection":
            current.char = None
            obj = current.list = RDF.nil
            upcoming.start = self.node_element_start
            upcoming.end = self.list_node_element_end
        else:  # if parse_type=="Literal":
            # All other values are treated as Literal
            # See: http://www.w3.org/TR/rdf-syntax-grammar/
            # parseTypeOtherPropertyElt
            obj = Literal("", datatype=RDFVOC.XMLLiteral)
            current.char = self.literal_element_char
            current.declared = {XMLNS: "xml"}
            upcoming.start = self.literal_element_start
            upcoming.char = self.literal_element_char
            upcoming.end = self.literal_element_end
        current.object = obj
        return
    else:
        obj = None
        current.char = self.property_element_char
        upcoming.start = self.node_element_start
        upcoming.end = self.node_element_end

    datatype = current.datatype = atts.get(RDFVOC.datatype, None)
    language = current.language
    if datatype is not None:
        # TODO: check that there are no atts other than datatype and id
        # NOTE(review): only this local is absolutized; current.datatype
        # keeps the raw attribute value -- confirm that is intended.
        datatype = absolutize(datatype)
    else:
        rdf_prefix = str(RDFNS)
        for att, raw in atts.items():
            if att.startswith(rdf_prefix):
                if att in PROPERTY_ELEMENT_ATTRIBUTES:
                    continue
                if att in PROPERTY_ATTRIBUTE_EXCEPTIONS:
                    self.error("""Invalid property attribute URI: %s""" % att)
            predicate = absolutize(att)
            value: _ObjectType
            if att == RDF.type:
                value = URIRef(raw)
            else:
                # No datatype on this path, so the literal is plain.
                value = Literal(raw, language)

            if obj is None:
                obj = BNode()
            self.store.add((obj, predicate, value))

    if obj is None:
        # No object yet: collect character data for a literal value.
        current.data = ""
        current.object = None
    else:
        current.data = None
        current.object = obj
|
||||
|
||||
def property_element_char(self, data: str) -> None:
    """Append ``data`` to the current handler's text buffer, if one is open.

    A ``data`` attribute of ``None`` on the current handler means no text is
    being collected, and the characters are dropped.
    """
    element = self.current
    if element.data is None:
        return
    element.data += data
|
||||
|
||||
def property_element_end(self, name: Tuple[str, str], qname) -> None:
    """Handle the end of a property element.

    Turns collected character data into a literal object when no object was
    set, terminates an open collection, adds the
    ``(parent subject, predicate, object)`` triple, and reifies it when the
    element carried an ``rdf:ID``.
    """
    element = self.current

    if element.data is not None and element.object is None:
        # A datatype takes the place of a language tag.
        lang = None if element.datatype is not None else element.language
        element.object = Literal(element.data, lang, element.datatype)
        element.data = None

    closing_collection = self.next.end == self.list_node_element_end
    if closing_collection and element.object != RDF.nil:
        self.store.add((element.list, RDF.rest, RDF.nil))

    if element.object is not None:
        statement = (self.parent.subject, element.predicate, element.object)
        self.store.add(statement)
        if element.id is not None:
            self.add_reified(
                element.id,
                (self.parent.subject, element.predicate, element.object),
            )

    element.subject = None
|
||||
|
||||
def list_node_element_end(self, name: Tuple[str, str], qname) -> None:
    """Handle the end of a node element inside a ``parseType="Collection"``.

    Adds a new blank list cell whose ``rdf:first`` is the node's subject.
    The first cell becomes the parent's object; later cells are linked from
    the previous cell with ``rdf:rest``.
    """
    cell = BNode()
    owner = self.parent
    if owner.list == RDF.nil:
        # First member: the new cell is the head of the list.
        self.store.add((cell, RDF.first, self.current.subject))
        owner.list = cell
        owner.object = cell
        owner.char = None
    else:
        self.store.add((owner.list, RDF.rest, cell))
        self.store.add((cell, RDF.first, self.current.subject))
        owner.list = cell
|
||||
|
||||
def literal_element_start(
    self, name: Tuple[str, str], qname, attrs: AttributesImpl
) -> None:
    """Serialise the start tag of an element nested in an XML literal.

    The tag text is stored as the current handler's ``object``. A namespace
    not yet in the inherited ``declared`` mapping gets an ``xmlns``
    declaration on the element. Children are handled by the literal
    callbacks again.
    """
    element = self.current
    upcoming = self.next
    upcoming.start = self.literal_element_start
    upcoming.char = self.literal_element_char
    upcoming.end = self.literal_element_end
    element.declared = self.parent.declared.copy()

    namespace, local = name[0], name[1]
    pieces = []
    if namespace:
        prefix = self._current_context[namespace]
        pieces.append("<%s:%s" % (prefix, local) if prefix else "<%s" % local)
        if namespace not in element.declared:
            element.declared[namespace] = prefix
            if prefix:
                pieces.append(' xmlns:%s="%s"' % (prefix, namespace))
            else:
                pieces.append(' xmlns="%s"' % namespace)
    else:
        pieces.append("<%s" % local)

    for attr_name, attr_value in attrs.items():
        attr_ns = attr_name[0]
        if attr_ns:
            if attr_ns not in element.declared:
                element.declared[attr_ns] = self._current_context[attr_ns]
            shown = element.declared[attr_ns] + ":" + attr_name[1]
        else:
            shown = attr_name[1]
        pieces.append(" %s=%s" % (shown, quoteattr(attr_value)))

    pieces.append(">")
    element.object = "".join(pieces)
|
||||
|
||||
def literal_element_char(self, data: str) -> None:
    """Append XML-escaped character data to the literal being serialised."""
    element = self.current
    element.object = element.object + escape(data)
|
||||
|
||||
def literal_element_end(self, name: Tuple[str, str], qname) -> None:
    """Close an element nested in an XML literal.

    The element's serialised text plus its end tag is appended to the
    parent's ``object``. The end tag is prefixed only when the element's
    namespace maps to a non-empty prefix in ``self._current_context``.
    """
    tag = name[1]
    if name[0]:
        prefix = self._current_context[name[0]]
        if prefix:
            tag = "%s:%s" % (prefix, tag)
    closing = "</%s>" % tag
    self.parent.object += self.current.object + closing
|
||||
|
||||
|
||||
def create_parser(target: InputSource, store: Graph) -> xmlreader.XMLReader:
    """Build a namespace-aware SAX reader that feeds ``store`` via RDFXMLHandler.

    :param target: input source; it is installed as the handler's document
        locator.
    :param store: graph receiving the parsed triples.
    :returns: the configured reader; the caller starts the actual parse.
    """
    reader = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        reader.start_namespace_decl("xml", "http://www.w3.org/XML/1998/namespace")  # type: ignore[attr-defined]
    except AttributeError:
        pass  # Not present in Jython (at least)
    reader.setFeature(handler.feature_namespaces, 1)

    content_handler = RDFXMLHandler(store)
    # type error: Argument 1 to "setDocumentLocator" of "RDFXMLHandler" has incompatible type "InputSource"; expected "Locator"
    content_handler.setDocumentLocator(target)  # type: ignore[arg-type]

    reader.setContentHandler(content_handler)
    reader.setErrorHandler(ErrorHandler())
    return reader
|
||||
|
||||
|
||||
class RDFXMLParser(Parser):
    """RDFLib parser plugin for RDF/XML."""

    def __init__(self):
        pass

    def parse(self, source: InputSource, sink: Graph, **args: Any) -> None:
        """Parse ``source`` into ``sink``.

        :param source: the RDF/XML input.
        :param sink: graph receiving the triples.
        :param args: optional ``preserve_bnode_ids``; when given and not
            ``None`` it is set on the content handler before parsing.
        """
        self._parser = create_parser(source, sink)
        keep_ids = args.get("preserve_bnode_ids", None)
        if keep_ids is not None:
            rdfxml_handler = self._parser.getContentHandler()
            # type error: ContentHandler has no attribute "preserve_bnode_ids"
            rdfxml_handler.preserve_bnode_ids = keep_ids  # type: ignore[attr-defined, unused-ignore]
        self._parser.parse(source)
|
||||
@@ -0,0 +1,177 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, MutableSequence
|
||||
|
||||
from rdflib.graph import ConjunctiveGraph, Graph
|
||||
from rdflib.parser import InputSource, Parser
|
||||
|
||||
from .notation3 import RDFSink, SinkParser
|
||||
|
||||
|
||||
def becauseSubGraph(*args, **kwargs):  # noqa: N802
    """Accept any arguments and do nothing.

    Installed as ``_reason2`` while a TriG graph block is being parsed.
    """
    return None
|
||||
|
||||
|
||||
class TrigSinkParser(SinkParser):
    """SinkParser extended with TriG graph blocks (``[GRAPH] name { ... }``)."""

    def directiveOrStatement(self, argstr: str, h: int) -> int:  # noqa: N802
        """Parse one graph block, directive or statement starting at ``h``.

        Alternatives are tried in order: graph block, SPARQL-style directive,
        directive, statement. Directives and statements must be followed by
        a dot.

        :returns: the offset after the parsed construct, or a negative value
            at EOF or when nothing matched.
        """
        i = self.skipSpace(argstr, h)
        if i < 0:
            return i  # EOF

        j = self.graph(argstr, i)
        if j >= 0:
            return j

        j = self.sparqlDirective(argstr, i)
        if j >= 0:
            return j

        j = self.directive(argstr, i)
        if j >= 0:
            return self.checkDot(argstr, j)

        j = self.statement(argstr, i)
        if j >= 0:
            return self.checkDot(argstr, j)

        return j

    def labelOrSubject(  # noqa: N802
        self, argstr: str, i: int, res: MutableSequence[Any]
    ) -> int:
        """Parse a graph label: a URI reference or an empty ``[]`` blank node.

        The parsed term is appended to ``res``.

        :returns: the offset after the label, a negative value at EOF, or
            ``-1`` when no label is present.
        """
        j = self.skipSpace(argstr, i)
        if j < 0:
            return j  # eof
        i = j

        j = self.uri_ref2(argstr, i, res)
        if j >= 0:
            return j

        if argstr[i] == "[":
            j = self.skipSpace(argstr, i + 1)
            if j < 0:
                self.BadSyntax(argstr, i, "Expected ] got EOF")
            if argstr[j] == "]":
                res.append(self.blankNode())
                return j + 1
        return -1

    def graph(self, argstr: str, i: int) -> int:
        """
        Parse trig graph, i.e.

           <urn:graphname> = { .. triples .. }

        return -1 if it doesn't look like a graph-decl
        raise Exception if it looks like a graph, but isn't.
        """
        need_graphid = False
        j = self.sparqlTok("GRAPH", argstr, i)  # optional GRAPH keyword
        if j >= 0:
            i = j
            need_graphid = True

        r: MutableSequence[Any] = []
        j = self.labelOrSubject(argstr, i, r)
        if j >= 0:
            graph_id = r[0]
            i = j
        elif need_graphid:
            self.BadSyntax(argstr, i, "GRAPH keyword must be followed by graph name")
        else:
            graph_id = self._store.graph.identifier  # hack

        j = self.skipSpace(argstr, i)
        if j < 0:
            self.BadSyntax(argstr, i, "EOF found when expected graph")

        if argstr[j : j + 1] == "=":  # optional = for legacy support
            i = self.skipSpace(argstr, j + 1)
            if i < 0:
                # i is the negative EOF marker here and carries no position;
                # report the offset just after the "=" instead.
                self.BadSyntax(argstr, j + 1, "EOF found when expecting '{'")
        else:
            i = j

        if argstr[i : i + 1] != "{":
            return -1  # the node wasn't part of a graph

        j = i + 1

        if self._context is not None:
            self.BadSyntax(argstr, i, "Nested graphs are not allowed")

        # Save parser state so it can be restored after the block.
        oldParentContext = self._parentContext  # noqa: N806
        self._parentContext = self._context
        reason2 = self._reason2
        self._reason2 = becauseSubGraph
        # type error: Incompatible types in assignment (expression has type "Graph", variable has type "Optional[Formula]")
        self._context = self._store.newGraph(graph_id)  # type: ignore[assignment]

        while True:
            i = self.skipSpace(argstr, j)
            if i < 0:
                # Same as above: report the last valid offset, not the marker.
                self.BadSyntax(argstr, j, "needed '}', found end.")

            if argstr[i : i + 1] == "}":
                j = i + 1
                break

            j = self.directiveOrStatement(argstr, i)
            if j < 0:
                self.BadSyntax(argstr, i, "expected statement or '}'")

        self._context = self._parentContext
        self._reason2 = reason2
        self._parentContext = oldParentContext
        return j
|
||||
|
||||
|
||||
class TrigParser(Parser):
    """
    An RDFLib parser for TriG

    """

    def __init__(self):
        pass

    def parse(self, source: InputSource, graph: Graph, encoding: str = "utf-8") -> None:
        """Parse TriG from ``source`` into the store behind ``graph``.

        :param source: the TriG input; its character stream is preferred and
            the byte stream is the fallback.
        :param graph: target graph; it becomes the default context and must
            be backed by a context-aware store.
        :param encoding: must be ``None`` or ``"utf-8"``.
        :raises Exception: if another encoding is requested.
        """
        if encoding not in (None, "utf-8"):
            # The message used to be a tuple, so "%" raised TypeError before
            # the intended exception could be built.
            raise Exception(
                "TriG files are always utf-8 encoded, I was passed: %s" % (encoding,)
            )

        # we're currently being handed a Graph, not a ConjunctiveGraph
        assert graph.store.context_aware, "TriG Parser needs a context-aware store!"

        conj_graph = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
        conj_graph.default_context = graph  # TODO: CG __init__ should have a
        # default_context arg
        # TODO: update N3Processor so that it can use conj_graph as the sink
        conj_graph.namespace_manager = graph.namespace_manager

        sink = RDFSink(conj_graph)

        baseURI = conj_graph.absolutize(  # noqa: N806
            source.getPublicId() or source.getSystemId() or ""
        )
        p = TrigSinkParser(sink, baseURI=baseURI, turtle=True)

        stream = source.getCharacterStream()  # try to get str stream first
        if not stream:
            # fallback to get the bytes stream
            stream = source.getByteStream()
        p.loadStream(stream)

        # Copy the prefixes seen while parsing onto the graph.
        for prefix, namespace in p._bindings.items():
            conj_graph.bind(prefix, namespace)
|
||||
@@ -0,0 +1,296 @@
|
||||
"""
|
||||
A TriX parser for RDFLib
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, NoReturn, Optional, Tuple
|
||||
from xml.sax import handler, make_parser
|
||||
from xml.sax.handler import ErrorHandler
|
||||
|
||||
from rdflib.exceptions import ParserError
|
||||
from rdflib.graph import Graph
|
||||
from rdflib.namespace import Namespace
|
||||
from rdflib.parser import InputSource, Parser
|
||||
from rdflib.store import Store
|
||||
from rdflib.term import BNode, Identifier, Literal, URIRef
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# from xml.sax.expatreader import ExpatLocator
|
||||
from xml.sax.xmlreader import AttributesImpl, Locator, XMLReader
|
||||
|
||||
__all__ = ["create_parser", "TriXHandler", "TriXParser"]
|
||||
|
||||
|
||||
TRIXNS = Namespace("http://www.w3.org/2004/03/trix/trix-1/")
|
||||
XMLNS = Namespace("http://www.w3.org/XML/1998/namespace")
|
||||
|
||||
|
||||
class TriXHandler(handler.ContentHandler):
    """A SAX handler for TriX. See http://sw.nokia.com/trix/

    ``self.state`` tracks the position in the document:

    * 0 -- outside the TriX root element
    * 1 -- inside the root, outside any graph
    * 2 -- inside a graph
    * 3 -- reading the graph's name (a ``uri`` or ``id`` directly in a graph)
    * 4 -- inside a triple
    """

    lang: Optional[str]
    datatype: Optional[str]

    def __init__(self, store: Store):
        self.store = store
        self.preserve_bnode_ids = False
        self.reset()

    def reset(self) -> None:
        """Reset all per-document parsing state."""
        self.bnode: Dict[str, BNode] = {}
        self.graph: Optional[Graph] = None
        self.triple: Optional[List[Identifier]] = None
        self.state = 0
        self.lang = None
        self.datatype = None
        # Initialised here so characters() never sees a missing attribute.
        self.chars = ""

    # ContentHandler methods

    def setDocumentLocator(self, locator: Locator):
        self.locator = locator

    def startDocument(self) -> None:
        pass

    def startPrefixMapping(self, prefix: Optional[str], namespace: str) -> None:
        pass

    def endPrefixMapping(self, prefix: Optional[str]) -> None:
        pass

    def _read_lang(self, attrs: AttributesImpl) -> Optional[str]:
        """Return the element's ``xml:lang`` value, or ``None`` when absent."""
        try:
            # type error: Argument 1 to "getValue" of "AttributesImpl" has incompatible type "Tuple[str, str]"; expected "str"
            return attrs.getValue((str(XMLNS), "lang"))  # type: ignore[arg-type, unused-ignore]
        except KeyError:
            # language not required - ignore
            return None

    def startElementNS(
        self, name: Tuple[Optional[str], str], qname, attrs: AttributesImpl
    ) -> None:
        """Advance the state machine for an opening element.

        Only elements in the TriX namespace are accepted. An element that is
        not valid in the current state is reported through ``self.error``.
        The character buffer is cleared at the end.
        """
        if name[0] != str(TRIXNS):
            self.error(
                "Only elements in the TriX namespace are allowed. %s!=%s"
                % (name[0], TRIXNS)
            )

        local = name[1]
        if local.lower() == "trix":
            if self.state == 0:
                self.state = 1
            else:
                self.error("Unexpected TriX element")

        elif local == "graph":
            if self.state == 1:
                self.state = 2
            else:
                self.error("Unexpected graph element")

        elif local in ("uri", "id"):
            if self.state == 2:
                # the context uri
                self.state = 3
            elif self.state == 4:
                # part of a triple
                pass
            else:
                self.error("Unexpected %s element" % local)

        elif local == "triple":
            if self.state == 2:
                if self.graph is None:
                    # anonymous graph, create one with random bnode id
                    self.graph = Graph(store=self.store)
                # start of a triple
                self.triple = []
                self.state = 4
            else:
                self.error("Unexpected triple element")

        elif local == "typedLiteral":
            if self.state == 4:
                # part of triple
                self.datatype = None
                self.lang = self._read_lang(attrs)
                try:
                    self.datatype = attrs.getValueByQName("datatype")  # type: ignore[arg-type, unused-ignore]
                except KeyError:
                    self.error("No required attribute 'datatype'")
            else:
                self.error("Unexpected typedLiteral element")

        elif local == "plainLiteral":
            if self.state == 4:
                # part of triple
                self.datatype = None
                self.lang = self._read_lang(attrs)
            else:
                self.error("Unexpected plainLiteral element")

        else:
            self.error("Unknown element %s in TriX namespace" % local)

        self.chars = ""

    def endElementNS(self, name: Tuple[Optional[str], str], qname) -> None:
        """Consume the buffered text for a closing element.

        ``uri``/``id`` either name the graph (state 3) or add a term to the
        current triple (state 4); literals add a term; ``triple`` adds the
        three collected terms to the current graph.
        """
        if TYPE_CHECKING:
            assert self.triple is not None
        if name[0] != str(TRIXNS):
            self.error(
                "Only elements in the TriX namespace are allowed. %s!=%s"
                % (name[0], TRIXNS)
            )

        local = name[1]
        if local in ("uri", "id"):
            if self.state not in (3, 4):
                self.error(
                    "Illegal internal self.state - This should never "
                    + "happen if the SAX parser ensures XML syntax correctness"
                )
            text = self.chars.strip()
            term: Identifier = URIRef(text) if local == "uri" else self.get_bnode(text)
            if self.state == 3:
                self.graph = Graph(store=self.store, identifier=term)
                self.state = 2
            else:
                self.triple.append(term)

        elif local == "plainLiteral" or local == "typedLiteral":
            if self.state == 4:
                self.triple.append(
                    Literal(self.chars, lang=self.lang, datatype=self.datatype)
                )
            else:
                self.error(
                    "This should never happen if the SAX parser "
                    + "ensures XML syntax correctness"
                )

        elif local == "triple":
            if self.state == 4:
                if len(self.triple) != 3:
                    self.error(
                        "Triple has wrong length, got %d elements: %s"
                        % (len(self.triple), self.triple)
                    )
                # type error: Item "None" of "Optional[Graph]" has no attribute "add"
                # type error: Argument 1 to "add" of "Graph" has incompatible type "List[Identifier]"; expected "Tuple[Node, Node, Node]"
                self.graph.add(self.triple)  # type: ignore[union-attr, arg-type]
                self.state = 2
            else:
                self.error(
                    "This should never happen if the SAX parser "
                    + "ensures XML syntax correctness"
                )

        elif local == "graph":
            self.graph = None
            self.state = 1

        elif local.lower() == "trix":
            self.state = 0

        else:
            self.error("Unexpected close element")

    def get_bnode(self, label: str) -> BNode:
        """Return the blank node for ``label``.

        With ``preserve_bnode_ids`` set, a ``BNode(label)`` is built on each
        call; otherwise one node per label is cached in ``self.bnode``.
        """
        if self.preserve_bnode_ids:
            return BNode(label)
        if label not in self.bnode:
            # NOTE(review): the cached node is also built from the document
            # label, so ids are kept either way -- confirm this is intended.
            self.bnode[label] = BNode(label)
        return self.bnode[label]

    def characters(self, content: str) -> None:
        self.chars += content

    def ignorableWhitespace(self, content) -> None:
        pass

    def processingInstruction(self, target, data) -> None:
        pass

    def error(self, message: str) -> NoReturn:
        """Raise ``ParserError`` for ``message``, prefixed with the locator position."""
        locator = self.locator
        info = "%s:%s:%s: " % (
            locator.getSystemId(),
            locator.getLineNumber(),
            locator.getColumnNumber(),
        )
        raise ParserError(info + message)
|
||||
|
||||
|
||||
def create_parser(store: Store) -> XMLReader:
    """Build a namespace-aware SAX reader that feeds ``store`` via TriXHandler.

    :param store: store handed to the TriX content handler.
    :returns: the configured reader; the caller starts the actual parse.
    """
    reader = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        # type error: "XMLReader" has no attribute "start_namespace_decl"
        reader.start_namespace_decl("xml", "http://www.w3.org/XML/1998/namespace")  # type: ignore[attr-defined]
    except AttributeError:
        pass  # Not present in Jython (at least)
    reader.setFeature(handler.feature_namespaces, 1)

    reader.setContentHandler(TriXHandler(store))
    reader.setErrorHandler(ErrorHandler())
    return reader
|
||||
|
||||
|
||||
class TriXParser(Parser):
    """A parser for TriX. See http://sw.nokia.com/trix/"""

    def __init__(self):
        pass

    def parse(self, source: InputSource, sink: Graph, **args: Any) -> None:
        """Parse ``source`` into the store behind ``sink``.

        :param source: the TriX input.
        :param sink: graph whose store receives the data; the store must be
            context aware.
        :param args: optional ``preserve_bnode_ids``; when given and not
            ``None`` it is set on the content handler before parsing.
        """
        assert (
            sink.store.context_aware
        ), "TriXParser must be given a context aware store."

        self._parser = create_parser(sink.store)
        keep_ids = args.get("preserve_bnode_ids", None)
        if keep_ids is not None:
            trix_handler = self._parser.getContentHandler()
            # type error: ContentHandler has no attribute "preserve_bnode_ids"
            trix_handler.preserve_bnode_ids = keep_ids  # type: ignore[attr-defined, unused-ignore]
        self._parser.parse(source)
|
||||
Reference in New Issue
Block a user