2025-12-01

2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,676 @@
+"""
+Implementation of the JSON-LD Context structure. See:
+
+    http://json-ld.org/
+
+"""
+
+# https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/context.py
+from __future__ import annotations
+
+from collections import namedtuple
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Collection,
+    Dict,
+    Generator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
+from urllib.parse import urljoin, urlsplit
+
+from rdflib.namespace import RDF
+
+from .errors import (
+    INVALID_CONTEXT_ENTRY,
+    INVALID_REMOTE_CONTEXT,
+    RECURSIVE_CONTEXT_INCLUSION,
+)
+from .keys import (
+    BASE,
+    CONTAINER,
+    CONTEXT,
+    GRAPH,
+    ID,
+    IMPORT,
+    INCLUDED,
+    INDEX,
+    JSON,
+    LANG,
+    LIST,
+    NEST,
+    NONE,
+    PREFIX,
+    PROPAGATE,
+    PROTECTED,
+    REV,
+    SET,
+    TYPE,
+    VALUE,
+    VERSION,
+    VOCAB,
+)
+from .util import norm_url, source_to_json, split_iri
+
+NODE_KEYS = {GRAPH, ID, INCLUDED, JSON, LIST, NEST, NONE, REV, SET, TYPE, VALUE, LANG}
+
+
+class Defined(int):
+    """Integer subclass used to build the module's ``UNDEF`` sentinel."""
+
+    pass
+
+
+UNDEF = Defined(0)
+
+# From <https://tools.ietf.org/html/rfc3986#section-2.2>
+URI_GEN_DELIMS = (":", "/", "?", "#", "[", "]", "@")
+
+_ContextSourceType = Union[
+    List[Union[Dict[str, Any], str, None]], Dict[str, Any], str, None
+]
+
+
+class Context:
+    def __init__(
+        self,
+        source: _ContextSourceType = None,
+        base: Optional[str] = None,
+        version: Optional[float] = 1.1,
+    ):
+        """Create a context, optionally loading ``source`` right away.
+
+        :param source: context source handed to :meth:`load` when truthy.
+        :param base: initial base IRI; also kept unchanged as ``doc_base``.
+        :param version: processing version; a falsy value falls back to 1.1.
+        """
+        self.version: float = version or 1.1
+        self.language = None
+        self.vocab: Optional[str] = None
+        # Annotation only: ``_base`` is not assigned yet, so the first pass
+        # through the ``base`` setter stores the value without resolving it.
+        self._base: Optional[str]
+        self.base = base
+        self.doc_base = base
+        self.terms: Dict[str, Any] = {}
+        # _alias maps NODE_KEY to list of aliases
+        self._alias: Dict[str, List[str]] = {}
+        # Keyed by (idref, coercion-or-language, container key, reverse).
+        self._lookup: Dict[Tuple[str, Any, Union[Defined, str], bool], Term] = {}
+        # Maps a namespace IRI to the name of the prefix term defined for it.
+        self._prefixes: Dict[str, Any] = {}
+        self.active = False
+        self.parent: Optional[Context] = None
+        self.propagate = True
+        # Fetched remote context documents keyed by resolved URL.
+        self._context_cache: Dict[str, Any] = {}
+        if source:
+            self.load(source)
+
+    @property
+    def base(self) -> Optional[str]:
+        """The current base IRI, or ``None`` when unset."""
+        return self._base
+
+    @base.setter
+    def base(self, base: Optional[str]):
+        # Drop any fragment ("#...") before storing the base.
+        if base:
+            hash_index = base.find("#")
+            if hash_index > -1:
+                base = base[0:hash_index]
+        # Once ``_base`` exists, a new non-None value is resolved against the
+        # previous base via resolve_iri(); otherwise it is stored as given.
+        self._base = (
+            self.resolve_iri(base)
+            if (hasattr(self, "_base") and base is not None)
+            else base
+        )
+        # "<scheme>://<netloc>" of the fragment-stripped argument (not of the
+        # resolved ``_base``); read by shrink_iri().
+        self._basedomain = "%s://%s" % urlsplit(base)[0:2] if base else None
+
+    def subcontext(self, source: Any, propagate: bool = True) -> Context:
+        """Return a new context derived from this one with ``source`` loaded.
+
+        When this context has ``propagate`` set to ``False``, the new context
+        is derived from ``self.parent`` instead of from ``self``.
+        """
+        # IMPROVE: to optimize, implement SubContext with parent fallback support
+        parent = self.parent if self.propagate is False else self
+        # NOTE(review): ``self.parent`` can be None here, in which case the
+        # call below raises AttributeError -- confirm callers never hit this.
+        # type error: Item "None" of "Optional[Context]" has no attribute "_subcontext"
+        return parent._subcontext(source, propagate)  # type: ignore[union-attr]
+
+    def _subcontext(self, source: Any, propagate: bool) -> Context:
+        """Build a child context that starts from this context's state.
+
+        The child gets copies of the alias, term, lookup and prefix tables,
+        inherits language, vocab, base and doc_base, and then loads ``source``.
+        """
+        ctx = Context(version=self.version)
+        ctx.propagate = propagate
+        ctx.parent = self
+        ctx.language = self.language
+        ctx.vocab = self.vocab
+        ctx.base = self.base
+        ctx.doc_base = self.doc_base
+        # Alias lists are copied one by one so the child can edit them freely.
+        ctx._alias = {k: l[:] for k, l in self._alias.items()}  # noqa: E741
+        ctx.terms = self.terms.copy()
+        ctx._lookup = self._lookup.copy()
+        ctx._prefixes = self._prefixes.copy()
+        # The cache is shared (same dict object), not copied.
+        ctx._context_cache = self._context_cache
+        ctx.load(source)
+        return ctx
+
+    def _clear(self) -> None:
+        """Reset language, vocab, term tables and flags to their initial state.
+
+        ``base``, ``doc_base``, ``parent``, ``version`` and the context cache
+        are left untouched.
+        """
+        self.language = None
+        self.vocab = None
+        self.terms = {}
+        self._alias = {}
+        self._lookup = {}
+        self._prefixes = {}
+        self.active = False
+        self.propagate = True
+
+    def get_context_for_term(self, term: Optional[Term]) -> Context:
+        if term and term.context is not UNDEF:
+            return self._subcontext(term.context, propagate=True)
+        return self
+
+    def get_context_for_type(self, node: Any) -> Optional[Context]:
+        """Return the context that applies to ``node`` based on its type.
+
+        For version 1.1 and later, the first type of ``node`` that names a
+        defined term is consulted; if that term has a scoped context, a
+        non-propagating sub-context built from it is returned. Otherwise the
+        result is this context, or ``self.parent`` when this context does not
+        propagate.
+        """
+        if self.version >= 1.1:
+            rtype = self.get_type(node) if isinstance(node, dict) else None
+            # Normalize to a list so single and multiple types share one path.
+            if not isinstance(rtype, list):
+                rtype = [rtype] if rtype else []
+
+            typeterm = None
+            for rt in rtype:
+                try:
+                    typeterm = self.terms.get(rt)
+                except TypeError:
+                    # extra lenience, triggers if type is set to a literal
+                    pass
+                if typeterm is not None:
+                    break
+
+            if typeterm and typeterm.context:
+                subcontext = self.subcontext(typeterm.context, propagate=False)
+                if subcontext:
+                    return subcontext
+
+        return self.parent if self.propagate is False else self
+
+    def get_id(self, obj: Dict[str, Any]) -> Any:
+        """Return the value of ``obj`` under ``@id`` or one of its aliases."""
+        return self._get(obj, ID)
+
+    def get_type(self, obj: Dict[str, Any]) -> Any:
+        """Return the value of ``obj`` under ``@type`` or one of its aliases."""
+        return self._get(obj, TYPE)
+
+    def get_language(self, obj: Dict[str, Any]) -> Any:
+        """Return the value of ``obj`` under ``@language`` or one of its aliases."""
+        return self._get(obj, LANG)
+
+    def get_value(self, obj: Dict[str, Any]) -> Any:
+        """Return the value of ``obj`` under ``@value`` or one of its aliases."""
+        return self._get(obj, VALUE)
+
+    def get_graph(self, obj: Dict[str, Any]) -> Any:
+        """Return the value of ``obj`` under ``@graph`` or one of its aliases."""
+        return self._get(obj, GRAPH)
+
+    def get_list(self, obj: Dict[str, Any]) -> Any:
+        """Return the value of ``obj`` under ``@list`` or one of its aliases."""
+        return self._get(obj, LIST)
+
+    def get_set(self, obj: Dict[str, Any]) -> Any:
+        """Return the value of ``obj`` under ``@set`` or one of its aliases."""
+        return self._get(obj, SET)
+
+    def get_rev(self, obj: Dict[str, Any]) -> Any:
+        """Return the value of ``obj`` under ``@reverse`` or one of its aliases."""
+        return self._get(obj, REV)
+
+    def _get(self, obj: Dict[str, Any], key: str) -> Any:
+        for alias in self._alias.get(key, []):
+            if alias in obj:
+                return obj.get(alias)
+        return obj.get(key)
+
+    # type error: Missing return statement
+    def get_key(self, key: str) -> str:  # type: ignore[return]
+        for alias in self.get_keys(key):
+            return alias
+
+    def get_keys(self, key: str) -> Generator[str, None, None]:
+        if key in self._alias:
+            for alias in self._alias[key]:
+                yield alias
+        yield key
+
+    # Read-only shortcuts: each returns get_key() for one keyword, i.e. the
+    # first alias registered for that keyword or the keyword itself.
+    lang_key = property(lambda self: self.get_key(LANG))
+    id_key = property(lambda self: self.get_key(ID))
+    type_key = property(lambda self: self.get_key(TYPE))
+    value_key = property(lambda self: self.get_key(VALUE))
+    list_key = property(lambda self: self.get_key(LIST))
+    rev_key = property(lambda self: self.get_key(REV))
+    graph_key = property(lambda self: self.get_key(GRAPH))
+
+    def add_term(
+        self,
+        name: str,
+        idref: str,
+        coercion: Union[Defined, str] = UNDEF,
+        container: Union[Collection[Any], str, Defined] = UNDEF,
+        index: Optional[Union[str, Defined]] = None,
+        language: Optional[Union[str, Defined]] = UNDEF,
+        reverse: bool = False,
+        context: Any = UNDEF,
+        prefix: Optional[bool] = None,
+        protected: bool = False,
+    ):
+        if self.version < 1.1 or prefix is None:
+            prefix = isinstance(idref, str) and idref.endswith(URI_GEN_DELIMS)
+
+        if not self._accept_term(name):
+            return
+
+        if self.version >= 1.1:
+            existing = self.terms.get(name)
+            if existing and existing.protected:
+                return
+
+        if isinstance(container, (list, set, tuple)):
+            container = set(container)
+        elif container is not UNDEF:
+            container = set([container])
+        else:
+            container = set()
+
+        term = Term(
+            idref,
+            name,
+            coercion,
+            container,
+            index,
+            language,
+            reverse,
+            context,
+            prefix,
+            protected,
+        )
+
+        self.terms[name] = term
+
+        container_key: Union[Defined, str]
+        for container_key in (LIST, LANG, SET):  # , INDEX, ID, GRAPH):
+            if container_key in container:
+                break
+        else:
+            container_key = UNDEF
+
+        self._lookup[(idref, coercion or language, container_key, reverse)] = term
+
+        if term.prefix is True:
+            self._prefixes[idref] = name
+
+    def find_term(
+        self,
+        idref: str,
+        coercion: Optional[Union[str, Defined]] = None,
+        container: Union[Defined, str] = UNDEF,
+        language: Optional[str] = None,
+        reverse: bool = False,
+    ):
+        """Find the defined term for ``idref`` that best fits the given shape.
+
+        Lookups in the reverse table are tried in a fixed order, from the most
+        specific key to the plain ``(idref, UNDEF, UNDEF, reverse)`` key.
+
+        :return: the matching ``Term``, or ``None`` when nothing is registered.
+        """
+        lu = self._lookup
+
+        # With no explicit coercion, the language takes its slot in the key.
+        if coercion is None:
+            coercion = language
+
+        # 1. Exact coercion and container.
+        if coercion is not UNDEF and container:
+            found = lu.get((idref, coercion, container, reverse))
+            if found:
+                return found
+
+        # 2. Coercion without any container.
+        if coercion is not UNDEF:
+            found = lu.get((idref, coercion, UNDEF, reverse))
+            if found:
+                return found
+
+        # 3. Container-driven fallbacks.
+        if container:
+            found = lu.get((idref, coercion, container, reverse))
+            if found:
+                return found
+        elif language:
+            found = lu.get((idref, UNDEF, LANG, reverse))
+            if found:
+                return found
+        else:
+            found = lu.get((idref, coercion or UNDEF, SET, reverse))
+            if found:
+                return found
+
+        # 4. Term defined with neither coercion nor container.
+        return lu.get((idref, UNDEF, UNDEF, reverse))
+
+    def resolve(self, curie_or_iri: str) -> str:
+        """Expand ``curie_or_iri`` without vocab and resolve it against the base.
+
+        Blank node identifiers are returned as expanded; an expansion that
+        contains a space yields the empty string.
+        """
+        iri = self.expand(curie_or_iri, False)
+        # type error: Argument 1 to "isblank" of "Context" has incompatible type "Optional[str]"; expected "str"
+        if self.isblank(iri):  # type: ignore[arg-type]
+            # type error: Incompatible return value type (got "Optional[str]", expected "str")
+            return iri  # type: ignore[return-value]
+        # type error: Unsupported right operand type for in ("Optional[str]")
+        if " " in iri:  # type: ignore[operator]
+            return ""
+        # type error: Argument 1 to "resolve_iri" of "Context" has incompatible type "Optional[str]"; expected "str"
+        return self.resolve_iri(iri)  # type: ignore[arg-type]
+
+    def resolve_iri(self, iri: str) -> str:
+        """Resolve ``iri`` against the current base using ``norm_url``."""
+        # type error: Argument 1 to "norm_url" has incompatible type "Optional[str]"; expected "str"
+        return norm_url(self._base, iri)  # type: ignore[arg-type]
+
+    def isblank(self, ref: str) -> bool:
+        """Return ``True`` when ``ref`` starts with the ``_:`` prefix."""
+        return ref.startswith("_:")
+
+    def expand(self, term_curie_or_iri: Any, use_vocab: bool = True) -> Optional[str]:
+        """Expand a term, compact IRI or IRI reference.
+
+        :param term_curie_or_iri: value to expand; non-strings are returned
+            unchanged.
+        :param use_vocab: when true, defined terms and ``vocab`` are consulted.
+        :return: the expanded value; ``""`` when the input is rejected by
+            ``_accept_term``; ``None`` for a colon-less term when ``use_vocab``
+            is true and no vocab is set.
+        """
+        if not isinstance(term_curie_or_iri, str):
+            return term_curie_or_iri
+
+        if not self._accept_term(term_curie_or_iri):
+            return ""
+
+        if use_vocab:
+            term = self.terms.get(term_curie_or_iri)
+            if term:
+                return term.id
+
+        is_term, pfx, local = self._prep_expand(term_curie_or_iri)
+        # A "_" prefix is returned untouched (see isblank()).
+        if pfx == "_":
+            return term_curie_or_iri
+
+        if pfx is not None:
+            # Expand only through terms that are flagged as prefixes.
+            ns = self.terms.get(pfx)
+            if ns and ns.prefix and ns.id:
+                return ns.id + local
+        elif is_term and use_vocab:
+            if self.vocab:
+                return self.vocab + term_curie_or_iri
+            return None
+
+        # Everything else is treated as an IRI reference relative to the base.
+        return self.resolve_iri(term_curie_or_iri)
+
+    def shrink_iri(self, iri: str) -> str:
+        """Shorten ``iri`` using a known prefix or the base, when possible.
+
+        :return: ``prefix:name`` when the namespace part has a registered
+            prefix; ``""`` when ``iri`` equals the base; ``iri`` with the
+            base's ``scheme://netloc`` stripped when it starts with it;
+            otherwise ``iri`` unchanged.
+        """
+        ns, name = split_iri(str(iri))
+        pfx = self._prefixes.get(ns)
+        if pfx:
+            # type error: Argument 1 to "join" of "str" has incompatible type "Tuple[Any, Optional[str]]"; expected "Iterable[str]"
+            return ":".join((pfx, name))  # type: ignore[arg-type]
+        elif self._base:
+            if str(iri) == self._base:
+                return ""
+            # type error: Argument 1 to "startswith" of "str" has incompatible type "Optional[str]"; expected "Union[str, Tuple[str, ...]]"
+            elif iri.startswith(self._basedomain):  # type: ignore[arg-type]
+                # type error: Argument 1 to "len" has incompatible type "Optional[str]"; expected "Sized"
+                return iri[len(self._basedomain) :]  # type: ignore[arg-type]
+        return iri
+
+    def to_symbol(self, iri: str) -> Optional[str]:
+        """Return the most compact name this context offers for ``iri``.
+
+        Tried in order: a term found by ``find_term``, the local name when the
+        namespace equals ``vocab``, ``prefix:name`` for a registered prefix,
+        and finally the IRI string itself.
+        """
+        iri = str(iri)
+        term = self.find_term(iri)
+        if term:
+            return term.name
+        ns, name = split_iri(iri)
+        if ns == self.vocab:
+            return name
+        pfx = self._prefixes.get(ns)
+        if pfx:
+            # type error: Argument 1 to "join" of "str" has incompatible type "Tuple[Any, Optional[str]]"; expected "Iterable[str]"
+            return ":".join((pfx, name))  # type: ignore[arg-type]
+        return iri
+
+    def load(
+        self,
+        source: _ContextSourceType,
+        base: Optional[str] = None,
+        referenced_contexts: Optional[Set[Any]] = None,
+    ):
+        """Load one or more context sources into this context.
+
+        :param source: a context definition, a reference string, ``None``, or
+            a list of these.
+        :param base: base handed to ``_prep_sources`` for resolving references.
+        :param referenced_contexts: context URLs already referenced; a new set
+            is used when this is omitted or empty.
+        """
+        self.active = True
+        sources: List[Tuple[Optional[str], Union[Dict[str, Any], str, None]]] = []
+        # "Union[List[Union[Dict[str, Any], str]], List[Dict[str, Any]], List[str]]" : expression
+        # "Union[List[Dict[str, Any]], Dict[str, Any], List[str], str]" : variable
+        source = source if isinstance(source, list) else [source]
+        referenced_contexts = referenced_contexts or set()
+        # Flatten nested and referenced contexts into (url, definition) pairs.
+        self._prep_sources(base, source, sources, referenced_contexts)
+        for source_url, source in sources:
+            if source is None:
+                # A null entry resets everything that _clear() covers.
+                self._clear()
+            else:
+                # type error: Argument 1 to "_read_source" of "Context" has incompatible type "Union[Dict[str, Any], str]"; expected "Dict[str, Any]"
+                self._read_source(source, source_url, referenced_contexts)  # type: ignore[arg-type]
+
+    def _accept_term(self, key: str) -> bool:
+        if self.version < 1.1:
+            return True
+        if key and len(key) > 1 and key[0] == "@" and key[1].isalnum():
+            return key in NODE_KEYS
+        else:
+            return True
+
+    def _prep_sources(
+        self,
+        base: Optional[str],
+        inputs: Union[List[Union[Dict[str, Any], str, None]], List[str]],
+        sources: List[Tuple[Optional[str], Union[Dict[str, Any], str, None]]],
+        referenced_contexts: Set[str],
+        in_source_url: Optional[str] = None,
+    ):
+        """Flatten ``inputs`` into ``sources`` as ``(source_url, context)`` pairs.
+
+        String entries are fetched through ``_fetch_context``; documents with
+        an ``@context`` entry are unwrapped; lists are processed recursively.
+        Results are appended to ``sources`` in place.
+        """
+        for source in inputs:
+            source_url = in_source_url
+            new_base = base
+            if isinstance(source, str):
+                source_url = source
+                source_doc_base = base or self.doc_base
+                new_ctx = self._fetch_context(
+                    source, source_doc_base, referenced_contexts
+                )
+                # Nothing was loaded for this reference: skip it.
+                if new_ctx is None:
+                    continue
+                else:
+                    if base:
+                        if TYPE_CHECKING:
+                            # if base is not None, then source_doc_base won't be
+                            # none due to how it is assigned.
+                            assert source_doc_base is not None
+                        new_base = urljoin(source_doc_base, source_url)
+                    source = new_ctx
+
+            if isinstance(source, dict):
+                if CONTEXT in source:
+                    # Unwrap the document and always continue with a list.
+                    source = source[CONTEXT]
+                    # type ignore: Incompatible types in assignment (expression has type "List[Union[Dict[str, Any], str, None]]", variable has type "Union[Dict[str, Any], str, None]")
+                    source = source if isinstance(source, list) else [source]  # type: ignore[assignment]
+
+            if isinstance(source, list):
+                # type error: Statement is unreachable
+                self._prep_sources(  # type: ignore[unreachable]
+                    new_base, source, sources, referenced_contexts, source_url
+                )
+            else:
+                sources.append((source_url, source))
+
+    def _fetch_context(
+        self, source: str, base: Optional[str], referenced_contexts: Set[str]
+    ):
+        """Load the context document referenced by ``source``.
+
+        ``source`` is joined with ``base`` to form the URL, which is added to
+        ``referenced_contexts``. Loaded documents are stored in, and served
+        from, ``_context_cache``.
+
+        :raises JSONLDException: ``RECURSIVE_CONTEXT_INCLUSION`` when the URL
+            is already in ``referenced_contexts``; ``INVALID_REMOTE_CONTEXT``
+            when a non-empty document has no ``@context`` entry.
+        """
+        # type error: Value of type variable "AnyStr" of "urljoin" cannot be "Optional[str]"
+        source_url = urljoin(base, source)  # type: ignore[type-var]
+
+        if source_url in referenced_contexts:
+            raise RECURSIVE_CONTEXT_INCLUSION
+
+        # type error: Argument 1 to "add" of "set" has incompatible type "Optional[str]"; expected "str"
+        referenced_contexts.add(source_url)  # type: ignore[arg-type]
+
+        if source_url in self._context_cache:
+            return self._context_cache[source_url]
+
+        # type error: Incompatible types in assignment (expression has type "Optional[Any]", variable has type "str")
+        source_json, _ = source_to_json(source_url)
+        if source_json and CONTEXT not in source_json:
+            raise INVALID_REMOTE_CONTEXT
+
+        # type error: Invalid index type "Optional[str]" for "Dict[str, Any]"; expected type "str"
+        self._context_cache[source_url] = source_json  # type: ignore[index]
+
+        return source_json
+
+    def _read_source(
+        self,
+        source: Dict[str, Any],
+        source_url: Optional[str] = None,
+        referenced_contexts: Optional[Set[str]] = None,
+    ):
+        imports = source.get(IMPORT)
+        if imports:
+            if not isinstance(imports, str):
+                raise INVALID_CONTEXT_ENTRY
+
+            imported = self._fetch_context(
+                imports, self.base, referenced_contexts or set()
+            )
+            if not isinstance(imported, dict):
+                raise INVALID_CONTEXT_ENTRY
+
+            imported = imported[CONTEXT]
+            imported.update(source)
+            source = imported
+
+        self.vocab = source.get(VOCAB, self.vocab)
+        self.version = source.get(VERSION, self.version)
+        protected = source.get(PROTECTED, False)
+
+        for key, value in source.items():
+            if key in {VOCAB, VERSION, IMPORT, PROTECTED}:
+                continue
+            elif key == PROPAGATE and isinstance(value, bool):
+                self.propagate = value
+            elif key == LANG:
+                self.language = value
+            elif key == BASE:
+                if not source_url and not imports:
+                    self.base = value
+            else:
+                self._read_term(source, key, value, protected)
+
+    def _read_term(
+        self,
+        source: Dict[str, Any],
+        name: str,
+        dfn: Union[Dict[str, Any], str],
+        protected: bool = False,
+    ) -> None:
+        """Register the term ``name`` from its definition ``dfn``.
+
+        :param source: the context definition containing ``dfn``; used to
+            expand prefixes and terms that ``dfn`` refers to.
+        :param name: the term being defined.
+        :param dfn: an expanded term definition (dict) or a plain value.
+        :param protected: default protection flag; a dict definition may
+            override it with its own ``@protected`` entry.
+        """
+        idref = None
+        if isinstance(dfn, dict):
+            # term = self._create_term(source, key, value)
+            rev = dfn.get(REV)
+            protected = dfn.get(PROTECTED, protected)
+
+            coercion = dfn.get(TYPE, UNDEF)
+            if coercion and coercion not in (ID, TYPE, VOCAB):
+                coercion = self._rec_expand(source, coercion)
+
+            # A reverse definition supplies the IRI through @reverse.
+            idref = rev or dfn.get(ID, UNDEF)
+            if idref == TYPE:
+                idref = str(RDF.type)
+                coercion = VOCAB
+            elif idref is not UNDEF:
+                idref = self._rec_expand(source, idref)
+            elif ":" in name:
+                # No explicit id: the term name itself is expanded.
+                idref = self._rec_expand(source, name)
+            elif self.vocab:
+                idref = self.vocab + name
+
+            context = dfn.get(CONTEXT, UNDEF)
+
+            self.add_term(
+                name,
+                idref,
+                coercion,
+                dfn.get(CONTAINER, UNDEF),
+                dfn.get(INDEX, UNDEF),
+                dfn.get(LANG, UNDEF),
+                bool(rev),
+                context,
+                dfn.get(PREFIX),
+                protected=protected,
+            )
+        else:
+            if isinstance(dfn, str):
+                if not self._accept_term(dfn):
+                    return
+                idref = self._rec_expand(source, dfn)
+            # type error: Argument 2 to "add_term" of "Context" has incompatible type "Optional[str]"; expected "str"
+            self.add_term(name, idref, protected=protected)  # type: ignore[arg-type]
+
+        # Keep the keyword alias table in sync with the new definition.
+        if idref in NODE_KEYS:
+            self._alias.setdefault(idref, []).append(name)
+        else:
+            # undo aliases that may have been inherited from parent context
+            for v in self._alias.values():
+                if name in v:
+                    v.remove(name)
+
+    def _rec_expand(
+        self, source: Dict[str, Any], expr: Optional[str], prev: Optional[str] = None
+    ) -> Optional[str]:
+        """Recursively expand ``expr`` using ``source`` and defined terms.
+
+        Recursion stops when a step leaves the expression unchanged
+        (``expr == prev``) or when ``expr`` is one of ``NODE_KEYS``.
+
+        :param source: context definition consulted before defined terms.
+        :param expr: the expression to expand.
+        :param prev: the expression from the previous step.
+        """
+        if expr == prev or expr in NODE_KEYS:
+            return expr
+
+        nxt: Optional[str]
+        # type error: Argument 1 to "_prep_expand" of "Context" has incompatible type "Optional[str]"; expected "str"
+        is_term, pfx, nxt = self._prep_expand(expr)  # type: ignore[arg-type]
+        if pfx:
+            iri = self._get_source_id(source, pfx)
+            if iri is None:
+                # "pfx:" being the vocab itself means there is nothing to expand.
+                if pfx + ":" == self.vocab:
+                    return expr
+                else:
+                    term = self.terms.get(pfx)
+                    if term:
+                        iri = term.id
+
+            if iri is None:
+                # Unknown prefix: keep the expression, which ends the recursion.
+                nxt = expr
+            else:
+                nxt = iri + nxt
+        else:
+            nxt = self._get_source_id(source, nxt) or nxt
+            if ":" not in nxt and self.vocab:
+                return self.vocab + nxt
+
+        return self._rec_expand(source, nxt, expr)
+
+    def _prep_expand(self, expr: str) -> Tuple[bool, Optional[str], str]:
+        if ":" not in expr:
+            return True, None, expr
+        pfx, local = expr.split(":", 1)
+        if not local.startswith("//"):
+            return False, pfx, local
+        else:
+            return False, None, expr
+
+    def _get_source_id(self, source: Dict[str, Any], key: str) -> Optional[str]:
+        # .. from source dict or if already defined
+        term = source.get(key)
+        if term is None:
+            dfn = self.terms.get(key)
+            if dfn:
+                term = dfn.id
+        elif isinstance(term, dict):
+            term = term.get(ID)
+        return term
+
+    def _term_dict(self, term: Term) -> Union[Dict[str, Any], str]:
+        """Return the serializable definition of ``term``.
+
+        :return: the bare id string when ``@id`` is the only entry, otherwise
+            a dict with the applicable ``@type``, ``@container``,
+            ``@language`` and ``@id``/``@reverse`` entries.
+        """
+        tdict: Dict[str, Any] = {}
+        if term.type != UNDEF:
+            tdict[TYPE] = self.shrink_iri(term.type)
+        if term.container:
+            tdict[CONTAINER] = list(term.container)
+        if term.language != UNDEF:
+            tdict[LANG] = term.language
+        # A reverse term stores its IRI under @reverse instead of @id.
+        if term.reverse:
+            tdict[REV] = term.id
+        else:
+            tdict[ID] = term.id
+        if tdict.keys() == {ID}:
+            return tdict[ID]
+        return tdict
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Returns a dictionary representation of the context that can be
+        serialized to JSON.
+
+        :return: a dictionary representation of the context.
+        """
+        r = {v: k for (k, v) in self._prefixes.items()}
+        r.update({term.name: self._term_dict(term) for term in self._lookup.values()})
+        if self.base:
+            r[BASE] = self.base
+        if self.language:
+            r[LANG] = self.language
+        return r
+
+
+# Immutable record describing one term definition. The two adjacent string
+# literals below are concatenated, giving ten fields in total.
+Term = namedtuple(
+    "Term",
+    "id, name, type, container, index, language, reverse, context," "prefix, protected",
+)
+
+# Defaults apply to the last eight fields (type .. protected); ``id`` and
+# ``name`` are always required.
+Term.__new__.__defaults__ = (UNDEF, UNDEF, UNDEF, UNDEF, False, UNDEF, False, False)
@@ -0,0 +1,9 @@
+# https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/errors.py
+class JSONLDException(ValueError):  # noqa: N818
+    """Error type of the module-level JSON-LD error instances defined below."""
+
+    pass
+
+
+# http://www.w3.org/TR/json-ld-api/#idl-def-JsonLdErrorCode.{code-message}
+RECURSIVE_CONTEXT_INCLUSION = JSONLDException("recursive context inclusion")
+INVALID_REMOTE_CONTEXT = JSONLDException("invalid remote context")
+INVALID_CONTEXT_ENTRY = JSONLDException("invalid context entry")
@@ -0,0 +1,24 @@
+# https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/keys.py
+BASE = "@base"
+CONTAINER = "@container"
+CONTEXT = "@context"
+# DIRECTION = u'@direction'
+GRAPH = "@graph"
+ID = "@id"
+IMPORT = "@import"
+INCLUDED = "@included"
+INDEX = "@index"
+JSON = "@json"
+LANG = LANGUAGE = "@language"
+LIST = "@list"
+NEST = "@nest"
+NONE = "@none"
+PREFIX = "@prefix"
+PROPAGATE = "@propagate"
+PROTECTED = "@protected"
+REV = REVERSE = "@reverse"
+SET = "@set"
+TYPE = "@type"
+VALUE = "@value"
+VERSION = "@version"
+VOCAB = "@vocab"
@@ -0,0 +1,355 @@
+# https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/util.py
+from __future__ import annotations
+
+import json
+import pathlib
+from html.parser import HTMLParser
+from io import StringIO, TextIOBase, TextIOWrapper
+from typing import IO, TYPE_CHECKING, Any, Dict, List, Optional, TextIO, Tuple, Union
+
+if TYPE_CHECKING:
+    import json
+else:
+    try:
+        import json
+
+        assert json  # workaround for pyflakes issue #13
+    except ImportError:
+        import simplejson as json
+
+from posixpath import normpath, sep
+from typing import TYPE_CHECKING, cast
+from urllib.parse import urljoin, urlsplit, urlunsplit
+
+try:
+    import orjson
+
+    _HAS_ORJSON = True
+except ImportError:
+    orjson = None  # type: ignore[assignment, unused-ignore]
+    _HAS_ORJSON = False
+
+
+from rdflib.parser import (
+    BytesIOWrapper,
+    InputSource,
+    PythonInputSource,
+    StringInputSource,
+    URLInputSource,
+    create_input_source,
+)
+
+
+def source_to_json(
+    source: Optional[
+        Union[IO[bytes], TextIO, InputSource, str, bytes, pathlib.PurePath]
+    ],
+    fragment_id: Optional[str] = None,
+    extract_all_scripts: Optional[bool] = False,
+) -> Tuple[Union[Dict, List[Dict]], Any]:
+    """Extract JSON from a source document.
+
+    The source document can be JSON or HTML with embedded JSON script elements (type attribute = "application/ld+json").
+    To process as HTML ``source.content_type`` must be set to "text/html" or "application/xhtml+xml".
+
+    :param source: the input source document (JSON or HTML)
+
+    :param fragment_id: if source is an HTML document then extract only the script element with matching id attribute, defaults to None
+
+    :param extract_all_scripts: if source is an HTML document then extract all script elements (unless fragment_id is provided), defaults to False (extract only the first script element)
+
+    :return: Tuple with the extracted JSON document and value of the HTML base element
+
+    :raises ValueError: if the resolved input source provides neither a byte stream nor a character stream
+    """
+
+    if isinstance(source, PythonInputSource):
+        # Already-parsed Python data: hand it back untouched.
+        return source.data, None
+
+    if isinstance(source, StringInputSource):
+        # A StringInputSource is assumed to be never a HTMLJSON doc
+        html_base: Any = None
+        # We can get the original string from the StringInputSource
+        # It's hidden in the BytesIOWrapper 'wrapped' attribute
+        b_stream = source.getByteStream()
+        original_string: Optional[str] = None
+        json_dict: Union[Dict, List[Dict]]
+        if isinstance(b_stream, BytesIOWrapper):
+            wrapped_inner = cast(Union[str, StringIO, TextIOBase], b_stream.wrapped)
+            if isinstance(wrapped_inner, str):
+                original_string = wrapped_inner
+            elif isinstance(wrapped_inner, StringIO):
+                original_string = wrapped_inner.getvalue()
+        if _HAS_ORJSON:
+            if original_string is not None:
+                json_dict = orjson.loads(original_string)
+            elif isinstance(b_stream, BytesIOWrapper):
+                # use the CharacterStream instead
+                c_stream = source.getCharacterStream()
+                json_dict = orjson.loads(c_stream.read())
+            else:
+                # orjson assumes its in utf-8 encoding so
+                # don't bother to check the source.getEncoding()
+                json_dict = orjson.loads(b_stream.read())
+        else:
+            if original_string is not None:
+                json_dict = json.loads(original_string)
+            else:
+                json_dict = json.load(source.getCharacterStream())
+        return json_dict, html_base
+
+    # TODO: conneg for JSON (fix support in rdflib's URLInputSource!)
+    source = create_input_source(source, format="json-ld")
+    try:
+        content_type = source.content_type
+    except (AttributeError, LookupError):
+        content_type = None
+
+    is_html = content_type is not None and content_type.lower() in (
+        "text/html",
+        "application/xhtml+xml",
+    )
+    if is_html:
+        html_docparser: Optional[HTMLJSONParser] = HTMLJSONParser(
+            fragment_id=fragment_id, extract_all_scripts=extract_all_scripts
+        )
+    else:
+        html_docparser = None
+    # Either stream may be unavailable on a given input source; probe both.
+    try:
+        b_stream = source.getByteStream()
+    except (AttributeError, LookupError):
+        b_stream = None
+    try:
+        c_stream = source.getCharacterStream()
+    except (AttributeError, LookupError):
+        c_stream = None
+    if b_stream is None and c_stream is None:
+        raise ValueError(
+            f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
+        )
+    try:
+        b_encoding: Optional[str] = None if b_stream is None else source.getEncoding()
+    except (AttributeError, LookupError):
+        b_encoding = None
+    underlying_string: Optional[str] = None
+    if b_stream is not None and isinstance(b_stream, BytesIOWrapper):
+        # Try to find an underlying wrapped Unicode string to use?
+        wrapped_inner = b_stream.wrapped
+        if isinstance(wrapped_inner, str):
+            underlying_string = wrapped_inner
+        elif isinstance(wrapped_inner, StringIO):
+            underlying_string = wrapped_inner.getvalue()
+    try:
+        if is_html and html_docparser is not None:
+            # Offload parsing to the HTMLJSONParser
+            if underlying_string is not None:
+                html_string: str = underlying_string
+            elif c_stream is not None:
+                html_string = c_stream.read()
+            else:
+                if TYPE_CHECKING:
+                    assert b_stream is not None
+                if b_encoding is None:
+                    b_encoding = "utf-8"
+                html_string = TextIOWrapper(b_stream, encoding=b_encoding).read()
+            html_docparser.feed(html_string)
+            json_dict, html_base = html_docparser.get_json(), html_docparser.get_base()
+        elif _HAS_ORJSON:
+            html_base = None
+            if underlying_string is not None:
+                json_dict = orjson.loads(underlying_string)
+            elif (
+                (b_stream is not None and isinstance(b_stream, BytesIOWrapper))
+                or b_stream is None
+            ) and c_stream is not None:
+                # use the CharacterStream instead
+                json_dict = orjson.loads(c_stream.read())
+            else:
+                if TYPE_CHECKING:
+                    assert b_stream is not None
+                # b_stream is not None
+                json_dict = orjson.loads(b_stream.read())
+        else:
+            html_base = None
+            if underlying_string is not None:
+                # Assign instead of returning the bare value so that this
+                # branch also yields the documented (json, html_base) tuple.
+                json_dict = json.loads(underlying_string)
+            else:
+                if c_stream is not None:
+                    use_stream = c_stream
+                else:
+                    if TYPE_CHECKING:
+                        assert b_stream is not None
+                    # b_stream is not None
+                    if b_encoding is None:
+                        b_encoding = "utf-8"
+                    use_stream = TextIOWrapper(b_stream, encoding=b_encoding)
+                json_dict = json.load(use_stream)
+        return json_dict, html_base
+    finally:
+        # Best-effort close of whichever streams were obtained above.
+        if b_stream is not None:
+            try:
+                b_stream.close()
+            except AttributeError:
+                pass
+        if c_stream is not None:
+            try:
+                c_stream.close()
+            except AttributeError:
+                pass
+
+
+# Delimiters tried, in this priority order, when splitting an IRI.
+VOCAB_DELIMS = ("#", "/", ":")
+
+
+def split_iri(iri: str) -> Tuple[str, Optional[str]]:
+    """
+    Split ``iri`` after the last occurrence of a vocabulary delimiter.
+
+    The delimiters in ``VOCAB_DELIMS`` are tried in order; the first one
+    that occurs anywhere in ``iri`` decides the split, which happens at its
+    right-most occurrence.
+
+    :param iri: the IRI string to split.
+    :return: ``(head, tail)`` where ``head`` ends with the delimiter and
+        ``tail`` is the remainder (possibly empty), or ``(iri, None)`` when
+        no delimiter occurs in ``iri``.
+    """
+    for delim in VOCAB_DELIMS:
+        head, found, tail = iri.rpartition(delim)
+        if found:
+            return head + found, tail
+    return iri, None
+
+
+def norm_url(base: str, url: str) -> str:
+    """
+    Resolve ``url`` against ``base`` and normalize the resulting path.
+
+    A ``url`` that contains ``"://"`` or that parses with a scheme is
+    returned unchanged. For a ``urn``/``urn-x`` base the path portions are
+    joined by hand; otherwise the joined URL's path is normalized while a
+    trailing ``/`` is preserved. A trailing ``#`` on ``url`` is preserved
+    in the result.
+
+    >>> norm_url('http://example.org/', '/one')
+    'http://example.org/one'
+    >>> norm_url('http://example.org/', '/one#')
+    'http://example.org/one#'
+    >>> norm_url('http://example.org/one', 'two')
+    'http://example.org/two'
+    >>> norm_url('http://example.org/one/', 'two')
+    'http://example.org/one/two'
+    >>> norm_url('http://example.org/', 'http://example.net/one')
+    'http://example.net/one'
+    >>> norm_url('http://example.org/', 'http://example.org//one')
+    'http://example.org//one'
+    """
+    if "://" in url:
+        return url
+
+    # Fix for URNs
+    base_parts = urlsplit(base)
+    url_parts = urlsplit(url)
+    if url_parts.scheme:
+        # Assume full URL
+        return url
+
+    if base_parts.scheme in ("urn", "urn-x"):
+        # No scheme -> assume relative and join paths. Everything before the
+        # first "/" of the base path is kept verbatim; the rest is joined.
+        head, _, remainder = base_parts.path.partition("/")
+        joined_path = urljoin("/" + remainder, url_parts.path)
+        fragment = f"#{url_parts.fragment}" if url_parts.fragment else ""
+        result = f"{base_parts.scheme}:{head}{joined_path}{fragment}"
+    else:
+        joined = urlsplit(urljoin(base, url))
+        raw_path = joined.path
+        path = normpath(raw_path)
+        if sep != "/":
+            path = "/".join(path.split(sep))
+        # normpath drops a trailing slash; put it back when the input had one.
+        if raw_path.endswith("/") and not path.endswith("/"):
+            path += "/"
+        result = urlunsplit(
+            (joined.scheme, joined.netloc, path, joined.query, joined.fragment)
+        )
+
+    if url.endswith("#") and not result.endswith("#"):
+        result += "#"
+    return result
+
+
+def context_from_urlinputsource(source: URLInputSource) -> Optional[str]:
+    """
+    Return the JSON-LD context URL advertised in the ``Link`` headers of
+    ``source``, resolved against ``source.url``.
+
+    Please note that JSON-LD documents served with the application/ld+json media type
+    MUST have all context information, including references to external contexts,
+    within the body of the document. Contexts linked via a
+    http://www.w3.org/ns/json-ld#context HTTP Link Header MUST be
+    ignored for such documents.
+
+    :param source: input source exposing ``content_type``, ``url`` and
+        (optionally) ``links``.
+    :return: the resolved context URL, or ``None`` when the document is
+        ``application/ld+json``, when ``source`` has no ``links`` attribute,
+        or when no well-formed context link is present.
+    """
+    if source.content_type == "application/ld+json":
+        return None
+    try:
+        # source.links is the new way of getting Link headers from URLInputSource
+        links = source.links
+    except AttributeError:
+        return None
+    for link in links:
+        if ' rel="http://www.w3.org/ns/json-ld#context"' not in link:
+            continue
+        # str.find returns -1 for a missing bracket (str.index would raise
+        # ValueError), so a malformed header is skipped instead of crashing.
+        start, end = link.find("<"), link.find(">")
+        if -1 < start < end:
+            # type error: Value of type variable "AnyStr" of "urljoin" cannot be "Optional[str]"
+            return urljoin(source.url, link[start + 1 : end])  # type: ignore[type-var]
+    return None
+
+
+# Names exported by a star-import of this module. Note that the list includes
+# the underscore-prefixed _HAS_ORJSON flag and does not include HTMLJSONParser.
+__all__ = [
+    "json",
+    "source_to_json",
+    "split_iri",
+    "norm_url",
+    "context_from_urlinputsource",
+    "orjson",
+    "_HAS_ORJSON",
+]
+
+
+class HTMLJSONParser(HTMLParser):
+    """
+    HTML parser that collects the JSON found in
+    ``<script type="application/ld+json">`` elements and remembers the
+    ``href`` of the last ``<base>`` element seen.
+
+    :param fragment_id: when set, a script element whose ``id`` attribute
+        differs from this value is ignored.
+    :param extract_all_scripts: when false, only the first non-empty
+        matching script element is parsed; when true, all of them are.
+    """
+
+    def __init__(
+        self,
+        fragment_id: Optional[str] = None,
+        extract_all_scripts: Optional[bool] = False,
+    ):
+        super().__init__()
+        self.fragment_id = fragment_id
+        # Accumulated JSON documents, in document order.
+        self.json: List[Dict] = []
+        # True only while inside a script element of type application/ld+json.
+        self.contains_json = False
+        # True while inside a script element whose id differs from fragment_id.
+        self.fragment_id_does_not_match = False
+        self.base: Optional[str] = None
+        self.extract_all_scripts = extract_all_scripts
+        # Number of script elements parsed so far.
+        self.script_count = 0
+
+    def handle_starttag(self, tag, attrs):
+        """Update the per-element state from the start tag's attributes."""
+        self.contains_json = False
+        self.fragment_id_does_not_match = False
+
+        # Only set self. contains_json to True if the
+        # type is 'application/ld+json'
+        if tag == "script":
+            for attr, value in attrs:
+                if attr == "type" and value == "application/ld+json":
+                    self.contains_json = True
+                elif attr == "id" and self.fragment_id and value != self.fragment_id:
+                    self.fragment_id_does_not_match = True
+
+        elif tag == "base":
+            for attr, value in attrs:
+                if attr == "href":
+                    self.base = value
+
+    def handle_endtag(self, tag):
+        """Leave script context so text after ``</script>`` is not parsed as JSON."""
+        if tag == "script":
+            self.contains_json = False
+            self.fragment_id_does_not_match = False
+
+    def handle_data(self, data):
+        """Parse ``data`` as JSON when inside a matching ld+json script element."""
+        # Only do something when we know the context is a
+        # script element containing application/ld+json
+
+        if self.contains_json is True and self.fragment_id_does_not_match is False:
+
+            if not self.extract_all_scripts and self.script_count > 0:
+                return
+
+            if data.strip() == "":
+                # skip empty data elements
+                return
+
+            # Try to parse the json
+            if _HAS_ORJSON:
+                # orjson can load a unicode string
+                # if that's the only thing we have,
+                # its not worth encoding it to bytes
+                parsed = orjson.loads(data)
+            else:
+                parsed = json.loads(data)
+
+            # Add to the result document
+            if isinstance(parsed, list):
+                self.json.extend(parsed)
+            else:
+                self.json.append(parsed)
+
+            self.script_count += 1
+
+    def get_json(self) -> List[Dict]:
+        """Return the list of JSON documents collected so far."""
+        return self.json
+
+    def get_base(self) -> Optional[str]:
+        """Return the ``href`` of the last ``<base>`` element seen, or ``None``."""
+        return self.base