2025-12-01
This commit is contained in:
@@ -0,0 +1,27 @@
|
||||
# csvw - https://w3c.github.io/csvw/primer/
|
||||
|
||||
from .metadata import (
|
||||
TableGroup, Table, Column, ForeignKey, Link, NaturalLanguage, Datatype, URITemplate, CSVW,
|
||||
Dialect)
|
||||
|
||||
from .dsv import (UnicodeWriter,
|
||||
UnicodeReader, UnicodeReaderWithLineNumber, UnicodeDictReader, NamedTupleReader,
|
||||
iterrows, rewrite)
|
||||
|
||||
__all__ = [
|
||||
'TableGroup',
|
||||
'Table', 'Column', 'ForeignKey',
|
||||
'Link', 'NaturalLanguage',
|
||||
'Datatype',
|
||||
'URITemplate',
|
||||
'Dialect', 'UnicodeWriter',
|
||||
'UnicodeReader', 'UnicodeReaderWithLineNumber', 'UnicodeDictReader', 'NamedTupleReader',
|
||||
'iterrows', 'rewrite',
|
||||
'CSVW',
|
||||
]
|
||||
|
||||
__title__ = 'csvw'
|
||||
__version__ = '3.5.1'
|
||||
__author__ = 'Robert Forkel'
|
||||
__license__ = 'Apache 2.0, see LICENSE'
|
||||
__copyright__ = 'Copyright (c) 2024 Robert Forkel'
|
||||
@@ -0,0 +1,164 @@
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import pathlib
|
||||
import argparse
|
||||
import subprocess
|
||||
|
||||
from colorama import init, Fore, Style
|
||||
|
||||
from csvw import CSVW, TableGroup
|
||||
from csvw.db import Database
|
||||
from csvw.utils import metadata2markdown
|
||||
|
||||
|
||||
def parsed_args(desc, args, *argspecs):
    """
    Return `args` unchanged, or parse the command line if `args` is `None`.

    :param desc: Description text for the `argparse.ArgumentParser`.
    :param args: Pre-built arguments object; any value other than `None` is returned as is.
    :param argspecs: Pairs `(names, options)`, each forwarded as \
    `parser.add_argument(*names, **options)`.
    :return: `args`, or the namespace parsed from the command line.
    """
    if args is not None:
        return args
    # Nothing was injected by the caller, so consult the command line.
    cli = argparse.ArgumentParser(description=desc)  # pragma: no cover
    for names, options in argspecs:  # pragma: no cover
        cli.add_argument(*names, **options)
    return cli.parse_args()  # pragma: no cover
|
||||
|
||||
|
||||
def exit(ret, test=False):
    """
    Terminate the process with status `ret`, or hand `ret` back when testing.

    :param ret: Exit status.
    :param test: If truthy, `ret` is returned instead of calling `sys.exit`.
    :return: `ret` (only when `test` is truthy).
    """
    if not test:
        sys.exit(ret)  # pragma: no cover
    return ret
|
||||
|
||||
|
||||
def csvwdescribe(args=None, test=False):
    """
    Describe a (set of) CSV file(s) with basic CSVW metadata, printed as JSON to stdout.

    Schema inference is delegated to the external `frictionless` command; its JSON output is
    converted with `TableGroup.from_frictionless_datapackage`.

    :param args: Pre-parsed arguments (with attributes `delimiter` and `csv`) or `None` to \
    parse the command line.
    :param test: Passed on to :func:`exit`; if truthy the exit status is returned.
    :raises ValueError: If the `frictionless` executable cannot be found.
    """
    frictionless = shutil.which('frictionless')
    if not frictionless:  # pragma: no cover
        raise ValueError('The frictionless command must be installed for this functionality!\n'
                         'Run `pip install frictionless` and try again.')

    args = parsed_args(
        "Describe a (set of) CSV file(s) with basic CSVW metadata.",
        args,
        (['--delimiter'], dict(default=None)),
        (['csv'], dict(nargs='+', help="CSV files to describe as CSVW TableGroup")),
    )
    fargs = ['describe', '--json']
    if args.delimiter:
        # Serialize with json.dumps rather than string interpolation, so delimiters like a tab,
        # `"` or `\` still result in a valid JSON dialect description.
        fargs.extend(['--dialect', json.dumps({'delimiter': args.delimiter})])

    # A single argument without wildcard is described as one resource, anything else as package.
    onefile = len(args.csv) == 1 and '*' not in args.csv[0]
    if onefile:
        # Make sure we infer a tabular-data schema even if the file suffix does not suggest a CSV
        # file.
        fargs.extend(['--format', 'csv'])
    else:
        fargs.extend(['--type', 'package'])

    dp = json.loads(subprocess.check_output([frictionless] + fargs + args.csv))
    if onefile:
        # Wrap the single resource description, to get the shape of a data package.
        dp = dict(resources=[dp], profile='data-package')

    tg = TableGroup.from_frictionless_datapackage(dp)
    print(json.dumps(tg.asdict(), indent=4))
    return exit(0, test=test)
|
||||
|
||||
|
||||
def csvwvalidate(args=None, test=False):
    """
    Validate a (set of) CSV file(s) described by CSVW metadata and print `OK` or `FAIL`.

    The exit status is 0 for valid data, 1 for invalid data and 2 if a `ValueError` was raised
    while loading or validating.

    :param args: Pre-parsed arguments (with attributes `url` and `verbose`) or `None` to parse \
    the command line.
    :param test: Passed on to :func:`exit`; if truthy the exit status is returned.
    """
    init()
    args = parsed_args(
        "Validate a (set of) CSV file(s) described by CSVW metadata.",
        args,
        (['url'], dict(help='URL or local path to CSV or JSON metadata file.')),
        (['-v', '--verbose'], dict(action='store_true', default=False)),
    )
    status = 0
    try:
        dataset = CSVW(args.url, validate=True)
        if not dataset.is_valid:
            status = 1
            print(Style.BRIGHT + Fore.RED + 'FAIL')
            if args.verbose:
                # Show the collected validation warnings as details.
                for warning in dataset.warnings:
                    print(Style.DIM + str(warning.message))
        else:
            print(Style.BRIGHT + Fore.GREEN + 'OK')
    except ValueError as err:
        status = 2
        print(Style.BRIGHT + Fore.RED + 'FAIL')
        if args.verbose:
            print(Style.DIM + Fore.BLUE + str(err))
    return exit(status, test=test)
|
||||
|
||||
|
||||
def csvw2datasette(args=None, test=False):
    """
    Convert CSVW to data for datasette: a SQLite database plus a JSON metadata file.

    Both files (`datasette.db` and `datasette-metadata.json`) are written to `args.outdir`.

    :param args: Pre-parsed arguments (with attributes `url` and `outdir`) or `None` to parse \
    the command line.
    :param test: Passed on to :func:`exit`; if truthy the exit status is returned.
    """
    args = parsed_args(
        "Convert CSVW to data for datasette (https://datasette.io/).",
        args,
        (['url'], dict(help='URL or local path to CSV or JSON metadata file.')),
        (['-o', '--outdir'], dict(type=pathlib.Path, default=pathlib.Path('.'))),
    )
    dbname = 'datasette.db'
    mdname = 'datasette-metadata.json'
    dataset = CSVW(args.url)
    Database(dataset.tablegroup, fname=args.outdir / dbname).write_from_tg()

    # Copy the Dublin Core properties datasette knows about, if they are present.
    # FIXME: flesh out, see https://docs.datasette.io/en/stable/metadata.html
    md = {
        k: dataset.common_props['dc:{}'.format(k)]
        for k in ['title', 'description', 'license']
        if 'dc:{}'.format(k) in dataset.common_props}
    args.outdir.joinpath(mdname).write_text(json.dumps(md, indent=4))
    print("""Run
datasette {} --metadata {}
and open your browser at
http://localhost:8001/
to browse the data.
""".format(args.outdir / dbname, args.outdir / mdname))
    return exit(0, test=test)
|
||||
|
||||
|
||||
def csvw2json(args=None, test=False):
    """
    Print the JSON conversion of a CSVW dataset to stdout.

    :param args: Pre-parsed arguments (with attribute `url`) or `None` to parse the command line.
    :param test: Passed on to :func:`exit`; if truthy the exit status is returned.
    """
    args = parsed_args(
        "Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/",
        args,
        (['url'], dict(help='URL or local path to CSV or JSON metadata file.')),
    )
    converted = CSVW(args.url).to_json()
    print(json.dumps(converted, indent=4))
    return exit(0, test=test)
|
||||
|
||||
|
||||
def csvw2sqlite(args=None, test=False):  # pragma: no cover
    """
    Load the data of a CSVW TableGroup into a SQLite database file.

    An existing file at the output path is overwritten (`_force=True`).

    :param args: Pre-parsed arguments (with attributes `url` and `output`) or `None` to parse \
    the command line.
    :param test: Passed on to :func:`exit`; if truthy the exit status is returned.
    """
    args = parsed_args(
        # The description used to be a copy of the one for csvw2json.
        "Convert CSVW to a SQLite database.",
        args,
        (
            ['url'],
            dict(help='URL or local path to CSVW metadata file describing a TableGroup.\n\n'
                      'Note that not all valid CSVW datasets can be converted to SQLite. One '
                      'limitation is that all tables which are referenced by foreign keys must '
                      'have a primary key.')),
        (
            ['output'],
            dict(help='Path for the generated SQLite database file.')),
    )
    tg = TableGroup.from_file(args.url)
    db = Database(tg, args.output)
    db.write_from_tg(_force=True)
    return exit(0, test=test)
|
||||
|
||||
|
||||
def csvw2markdown(args=None, test=False):
    """
    Print a Markdown rendering of the metadata of a CSVW TableGroup to stdout.

    :param args: Pre-parsed arguments (with attribute `url`) or `None` to parse the command line.
    :param test: Passed on to :func:`exit`; if truthy the exit status is returned.
    """
    args = parsed_args(
        # Description and help used to be copies of the texts for the JSON/SQLite converters.
        "Convert CSVW metadata to a description in Markdown.",
        args,
        (
            ['url'],
            dict(help='URL or local path to CSVW metadata file describing a TableGroup.')),
    )
    tg = TableGroup.from_file(args.url)
    print(metadata2markdown(tg, link_files=True))
    return exit(0, test=test)
|
||||
|
||||
|
||||
if __name__ == '__main__': # pragma: no cover
|
||||
csvw2json()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,581 @@
|
||||
"""
|
||||
SQLite as alternative storage backend for a TableGroup's data.
|
||||
|
||||
For the most part, translation of a TableGroup's tableSchema to SQL works as expected:
|
||||
|
||||
- each table is converted to a `CREATE TABLE` statement
|
||||
- each column specifies a column in the corresponding `CREATE TABLE` statement
|
||||
- `foreignKey` constraints are added according to the corresponding `tableSchema` property.
|
||||
|
||||
List-valued foreignKeys are supported as follows: For each pair of tables related through a
|
||||
list-valued foreign key, an association table is created. To make it possible to distinguish
|
||||
multiple list-valued foreign keys between the same two tables, the association table has
|
||||
a column `context`, which stores the name of the foreign key column from which a row in the
|
||||
association table was created.
|
||||
|
||||
Other list-valued columns work in two different ways: If the atomic datatype is `string`, the
|
||||
specified separator is used to create a concatenated string representation in the database field.
|
||||
Otherwise, the list of values is serialized as JSON.
|
||||
|
||||
SQL table and column names can be customized by passing a translator callable when instantiating
|
||||
a :class:`Database`.
|
||||
|
||||
SQLite support has the following limitations:
|
||||
|
||||
- regex constraints on strings (as specified via a :class:`csvw.Datatype`'s format attribute) are
|
||||
not enforced by the database.
|
||||
"""
|
||||
import json
|
||||
import typing
|
||||
import decimal
|
||||
import pathlib
|
||||
import sqlite3
|
||||
import functools
|
||||
import contextlib
|
||||
import collections
|
||||
|
||||
import attr
|
||||
|
||||
import csvw
|
||||
from csvw.datatypes import DATATYPES
|
||||
from csvw.metadata import TableGroup
|
||||
|
||||
|
||||
def identity(s):
    """Return `s` unchanged; serves as the no-op value converter."""
    return s
|
||||
|
||||
|
||||
# Maps CSVW datatype names to triples
#   (SQLite column type, converter applied when writing, converter applied when reading).
# The converters with a `None` guard pass `None` through unchanged.
TYPE_MAP = {
    'string': ('TEXT', identity, identity),
    'integer': ('INTEGER', identity, identity),
    'boolean': (
        'INTEGER',
        lambda v: None if v is None else int(v),
        lambda v: None if v is None else bool(v),
    ),
    'decimal': (
        'REAL',
        lambda v: None if v is None else float(v),
        lambda v: None if v is None else decimal.Decimal(v),
    ),
    'hexBinary': ('BLOB', identity, identity),
}
|
||||
|
||||
|
||||
class SchemaTranslator(typing.Protocol):
    """
    Signature of callables translating CSVW table and column names to DB schema object names.

    Called with a table name only, or with a table name and the name of one of its columns.
    """

    def __call__(self, table: str, column: typing.Optional[str] = None) -> str:
        ...  # pragma: no cover
|
||||
|
||||
|
||||
class ColumnTranslator(typing.Protocol):
    """
    Signature of callables translating a CSVW column name to a DB column name.
    """

    def __call__(self, column: str) -> str:
        ...  # pragma: no cover
|
||||
|
||||
|
||||
def quoted(*names):
    """
    Wrap each name in backticks and join the results with commas.

    :param names: Schema object names; non-string values are formatted with `str.format`.
    :return: Comma-separated list of backtick-quoted names (empty string for no names).
    """
    return ','.join(map('`{0}`'.format, names))
|
||||
|
||||
|
||||
def insert(db: sqlite3.Connection,
           translate: SchemaTranslator,
           table: str,
           keys: typing.Sequence[str],
           *rows: list,
           single: typing.Optional[bool] = False):
    """
    Insert a sequence of rows into a table.

    If inserting all rows at once fails, each row is inserted again individually; the failing
    single-row insert prints the SQL and the row, and re-raises the error. Note that no error is
    raised, if all of the individual inserts succeed.

    :param db: Database connection.
    :param translate: Callable translating table and column names to proper schema object names.
    :param table: Untranslated table name.
    :param keys: Untranslated column names.
    :param rows: Sequence of rows to insert.
    :param single: Flag signaling whether to insert all rows at once using `executemany` or one at \
    a time, allowing for more focused debugging output in case of errors.
    """
    if not rows:
        return
    sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(
        quoted(translate(table)),
        quoted(*[translate(table, k) for k in keys]),
        ','.join('?' for _ in keys))
    try:
        db.executemany(sql, rows)
    except Exception:
        # Broad on purpose - this handler only exists to add debugging output. But we don't catch
        # BaseException, to not answer KeyboardInterrupt or SystemExit with a row-by-row retry.
        if single:
            print(sql)
            print(rows)
            raise
        for row in rows:
            insert(db, translate, table, keys, row, single=True)
|
||||
|
||||
|
||||
def select(db: sqlite3.Connection, table: str) -> typing.Tuple[typing.List[str], typing.Sequence]:
    """
    Read all rows of a table.

    :param db: Database connection.
    :param table: Name of the table as used in the database; it is backtick-quoted here.
    :return: A pair (column names, list of rows).
    """
    cursor = db.execute("SELECT * FROM {0}".format(quoted(table)))
    # The first item of each entry of the cursor description is the column name.
    names = [description[0] for description in cursor.description]
    records = list(cursor.fetchall())
    return names, records
|
||||
|
||||
|
||||
@attr.s
class ColSpec:
    """
    A `ColSpec` captures sufficient information about a :class:`csvw.Column` for the DB schema.
    """
    # CSVW name of the column.
    name = attr.ib()
    # Name of the CSVW datatype; falsy values are replaced with 'string'.
    csvw_type = attr.ib(default='string', converter=lambda s: s if s else 'string')
    # Separator for list-valued columns.
    separator = attr.ib(default=None)
    # db_type, convert and read are (re-)computed in `__attrs_post_init__`.
    db_type = attr.ib(default=None)
    convert = attr.ib(default=None)
    read = attr.ib(default=None)
    # Whether the column gets a NOT NULL constraint.
    required = attr.ib(default=False)
    # Object providing the constraint attributes (minimum, maximum, length, ...) read in `check`.
    csvw = attr.ib(default=None)

    def __attrs_post_init__(self):
        if self.csvw_type in TYPE_MAP:
            self.db_type, self.convert, self.read = TYPE_MAP[self.csvw_type]
        else:
            # Datatypes without dedicated DB type are stored in their string representation.
            self.db_type = 'TEXT'
            self.convert = DATATYPES[self.csvw_type].to_string
            self.read = DATATYPES[self.csvw_type].to_python
        if self.separator and self.db_type != 'TEXT':
            # List-valued columns are always stored as text.
            self.db_type = 'TEXT'

    def check(self, translate: ColumnTranslator) -> typing.Optional[str]:
        """
        We try to convert as many data constraints as possible into SQLite CHECK constraints.

        Length constraints are only considered if neither `minimum` nor `maximum` is specified.
        Constraints are recognized by being `not None`, thus a value of `0` is a valid constraint.

        :param translate: Callable to translate column names between CSVW metadata and DB schema.
        :return: A string suitable as argument of an SQL CHECK constraint; `None` if no \
        constraint information is available, and the empty string if nothing is constrained.
        """
        if not self.csvw:
            return None
        c, cname = self.csvw, translate(self.name)
        constraints = []
        if (c.minimum is not None) or (c.maximum is not None):
            # Dates and datetimes must be compared using the corresponding SQLite function.
            func = {
                'date': 'date',
                'datetime': 'datetime',
            }.get(self.csvw_type)
            for op, bound in (('>=', c.minimum), ('<=', c.maximum)):
                if bound is None:
                    continue
                if func:
                    constraints.append(
                        "{2}(`{0}`) {3} {2}('{1}')".format(cname, bound, func, op))
                else:
                    constraints.append('`{0}` {2} {1}'.format(cname, bound, op))
        else:
            # Testing for None rather than truthiness, to not drop constraints with value 0.
            for op, bound in (('=', c.length), ('>=', c.minLength), ('<=', c.maxLength)):
                if bound is not None:
                    constraints.append('length(`{0}`) {2} {1}'.format(cname, bound, op))
        return ' AND '.join(constraints)

    def sql(self, translate: ColumnTranslator) -> str:
        """
        :param translate: Callable to translate column names between CSVW metadata and DB schema.
        :return: The column definition for use in a `CREATE TABLE` statement.
        """
        _check = self.check(translate)
        return '`{0}` {1}{2}{3}'.format(
            translate(self.name),
            self.db_type,
            ' NOT NULL' if self.required else '',
            ' CHECK ({0})'.format(_check) if _check else '')
|
||||
|
||||
|
||||
@attr.s
class TableSpec(object):
    """
    A `TableSpec` captures sufficient information about a :class:`csvw.Table` for the DB schema.

    .. note::

        We support "light-weight" many-to-many relationships by allowing list-valued foreign key
        columns in CSVW. In the database these columns are turned into an associative table, adding
        the name of the column as value of a `context` column. Thus, multiple columns in a table
        may be specified as targets of many-to-many relations with the same table.

    .. seealso:: `<https://en.wikipedia.org/wiki/Associative_entity>`_
    """
    name = attr.ib()
    # List of ColSpec objects.
    columns = attr.ib(default=attr.Factory(list))
    # List of triples (column names, referenced table name, referenced column names).
    foreign_keys = attr.ib(default=attr.Factory(list))
    # Maps names of list-valued foreign key columns to the TableSpec of the association table.
    many_to_many = attr.ib(default=attr.Factory(collections.OrderedDict))
    primary_key = attr.ib(default=None)

    @classmethod
    def from_table_metadata(cls,
                            table: csvw.Table,
                            drop_self_referential_fks: typing.Optional[bool] = True) -> 'TableSpec':
        """
        Create a `TableSpec` from the schema description of a `csvw.metadata.Table`.

        :param table: `csvw.metadata.Table` instance.
        :param drop_self_referential_fks: Flag signaling whether to drop self-referential foreign \
        keys. This may be necessary, if the order of rows in a CSVW table does not guarantee \
        referential integrity when inserted in order (e.g. an earlier row referring to a later one).
        :return: `TableSpec` instance.
        """
        table_schema = table.tableSchema
        spec = cls(name=table.local_name, primary_key=table_schema.primaryKey)
        list_valued = {c.header for c in table_schema.columns if c.separator}
        for fk in table_schema.foreignKeys:
            if fk.reference.schemaReference:
                # We only support Foreign Key references between tables!
                continue
            if len(fk.columnReference) == 1 and fk.columnReference[0] in list_valued:
                # List-valued foreign keys are turned into a many-to-many relation!
                assert len(fk.reference.columnReference) == 1, \
                    'Composite key {0} in table {1} referenced'.format(
                        fk.reference.columnReference,
                        fk.reference.resource)
                assert spec.primary_key and len(spec.primary_key) == 1, \
                    'Table {0} referenced by list-valued foreign key must have non-composite ' \
                    'primary key'.format(spec.name)
                spec.many_to_many[fk.columnReference[0]] = TableSpec.association_table(
                    spec.name,
                    spec.primary_key[0],
                    fk.reference.resource.string,
                    fk.reference.columnReference[0],
                )
            elif not (drop_self_referential_fks and fk.reference.resource.string == spec.name):
                spec.foreign_keys.append((
                    sorted(fk.columnReference),
                    fk.reference.resource.string,
                    sorted(fk.reference.columnReference),
                ))
        for c in table_schema.columns:
            if c.header in spec.many_to_many:
                # These columns are represented by association tables.
                continue
            datatype = c.inherit('datatype')
            spec.columns.append(ColSpec(
                name=c.header,
                csvw_type=datatype.base if datatype else datatype,
                separator=c.inherit('separator'),
                required=c.inherit('required'),
                csvw=c.inherit('datatype'),
            ))
        return spec

    @classmethod
    def association_table(cls, atable, apk, btable, bpk) -> 'TableSpec':
        """
        List-valued foreignKeys are supported as follows: For each pair of tables related through a
        list-valued foreign key, an association table is created. To make it possible to distinguish
        multiple list-valued foreign keys between the same two tables, the association table has
        a column `context`, which stores the name of the foreign key column from which a row in the
        association table was created.

        :param atable: Name of the table with the list-valued foreign key.
        :param apk: Name of the primary key column of `atable`.
        :param btable: Name of the referenced table.
        :param bpk: Name of the referenced column of `btable`.
        :return: `TableSpec` of the association table, named `<atable>_<btable>`.
        """
        afk = ColSpec('{0}_{1}'.format(atable, apk))
        bfk = ColSpec('{0}_{1}'.format(btable, bpk))
        if afk.name == bfk.name:
            # Both foreign key columns would get the same name, so add distinguishing suffixes.
            afk.name += '_1'
            bfk.name += '_2'
        references = [
            ([afk.name], atable, [apk]),
            ([bfk.name], btable, [bpk]),
        ]
        return cls(
            name='{0}_{1}'.format(atable, btable),
            columns=[afk, bfk, ColSpec('context')],
            foreign_keys=references,
        )

    def sql(self, translate: SchemaTranslator) -> str:
        """
        :param translate: Callable translating table and column names to schema object names.
        :return: The SQL statement to create the table.
        """
        col_translate = functools.partial(translate, self.name)
        clauses = [col.sql(col_translate) for col in self.columns]
        if self.primary_key:
            pk_columns = quoted(*[col_translate(c) for c in self.primary_key])
            clauses.append('PRIMARY KEY({0})'.format(pk_columns))
        for fk, ref, refcols in self.foreign_keys:
            clauses.append('FOREIGN KEY({0}) REFERENCES {1}({2}) ON DELETE CASCADE'.format(
                quoted(*[col_translate(c) for c in fk]),
                quoted(translate(ref)),
                quoted(*[translate(ref, c) for c in refcols])))
        return "CREATE TABLE IF NOT EXISTS `{0}` (\n    {1}\n)".format(
            translate(self.name), ',\n    '.join(clauses))
|
||||
|
||||
|
||||
def schema(tg: csvw.TableGroup,
           drop_self_referential_fks: typing.Optional[bool] = True) -> typing.List[TableSpec]:
    """
    Convert the table and column descriptions of a `TableGroup` into specifications for the
    DB schema.

    :param tg: CSVW TableGroup.
    :param drop_self_referential_fks: Flag signaling whether to drop self-referential foreign \
    keys. This may be necessary, if the order of rows in a CSVW table does not guarantee \
    referential integrity when inserted in order (e.g. an earlier row referring to a later one).
    :return: List of `TableSpec` objects (including association tables), ordered such that each \
    table comes after all tables it references.
    :raises ValueError: If the tables cannot be ordered, i.e. if no remaining table has all its \
    referenced tables already ordered.
    """
    tables = {}
    for table in tg.tabledict.values():
        t = TableSpec.from_table_metadata(
            table, drop_self_referential_fks=drop_self_referential_fks)
        tables[t.name] = t
        for at in t.many_to_many.values():
            tables[at.name] = at

    # We must determine the order in which tables must be created!
    ordered = collections.OrderedDict()

    # We repeatedly pick the first table, which has all referenced tables already in ordered, and
    # move it from tables to ordered. The loop ends when no tables are left, or when no progress
    # can be made - rather than after a fixed number of passes, which would limit the number of
    # tables that can be handled.
    while tables:
        ready = next(
            (name for name, spec in tables.items()
             # All referenced tables are already created (or self-referential).
             if all(ref[1] in ordered or ref[1] == name for ref in spec.foreign_keys)),
            None)
        if ready is None:  # pragma: no cover
            raise ValueError('there seem to be cyclic dependencies between the tables')
        ordered[ready] = tables.pop(ready)

    return list(ordered.values())
|
||||
|
||||
|
||||
class Database(object):
    """
    Represents a SQLite database associated with a :class:`csvw.TableGroup` instance.

    :param tg: `TableGroup` instance defining the schema of the database.
    :param fname: Path to which to write the database file. If no path is given, an in-memory \
    database is used.
    :param translate: Schema object name translator.
    :param drop_self_referential_fks: Flag signaling whether to drop or enforce self-referential \
    foreign-key constraints.

    .. warning::

        We write rows of a table to the database sequentially. Since CSVW does not require ordering
        rows in tables such that self-referential foreign-key constraints are satisfied at each row,
        we don't enforce self-referential foreign-keys by default in order to not trigger "false"
        integrity errors. If data in a CSVW Table is known to be ordered appropriately, `False`
        should be passed as `drop_self_referential_fks` keyword parameter to enforce
        self-referential foreign-keys.
    """
    def __init__(
            self,
            tg: TableGroup,
            fname: typing.Optional[typing.Union[pathlib.Path, str]] = None,
            translate: typing.Optional[SchemaTranslator] = None,
            drop_self_referential_fks: typing.Optional[bool] = True,
    ):
        self.translate = translate or Database.name_translator
        self.fname = pathlib.Path(fname) if fname else None
        self.init_schema(tg, drop_self_referential_fks=drop_self_referential_fks)
        self._connection = None  # For in-memory dbs we need to keep the connection!

    def init_schema(self, tg, drop_self_referential_fks=True):
        """
        Store the TableGroup and compute the list of table specifications from it.

        :param tg: `TableGroup` instance; if falsy, the list of tables is empty.
        :param drop_self_referential_fks: Passed on to :func:`schema`.
        """
        self.tg = tg
        self.tables = schema(
            self.tg, drop_self_referential_fks=drop_self_referential_fks) if self.tg else []

    @property
    def tdict(self) -> typing.Dict[str, TableSpec]:
        """
        :return: `dict` mapping (untranslated) table names to `TableSpec` objects; built anew \
        on each access.
        """
        return {t.name: t for t in self.tables}

    @staticmethod
    def name_translator(table: str, column: typing.Optional[str] = None) -> str:
        """
        A callable with this signature can be passed into DB creation to control the names
        of the schema objects.

        :param table: CSVW name of the table before translation
        :param column: CSVW name of a column of `table` before translation
        :return: Translated table name if `column is None` else translated column name
        """
        # By default, no translation is done:
        return column or table

    def connection(self) -> typing.Union[sqlite3.Connection, contextlib.closing]:
        """
        :return: For file-based databases a new connection, wrapped to be closed at the end of a \
        `with` block; for in-memory databases the one connection which is kept on the instance.
        """
        if self.fname:
            return contextlib.closing(sqlite3.connect(str(self.fname)))
        if not self._connection:
            self._connection = sqlite3.connect(':memory:')
        return self._connection

    def select_many_to_many(self, db, table, context) -> dict:
        """
        Read the rows of an association table, grouped by the value of its first column.

        :param db: Database connection.
        :param table: `TableSpec` of the association table.
        :param context: Value of the `context` column to restrict the rows to, or `None` to \
        read all rows.
        :return: `dict` mapping values of the first column to lists of values of the second \
        column, or to lists of pairs (value, context) if `context is None`.
        """
        if context is not None:
            # The context is passed as query parameter, rather than spliced into the SQL, to
            # make this work for names containing quote characters as well.
            context_sql, params = "WHERE context = ?", (context,)
        else:
            context_sql, params = '', ()
        sql = """\
SELECT {0}, group_concat({1}, ' '), group_concat(COALESCE(context, ''), '||')
FROM {2} {3} GROUP BY {0}""".format(
            quoted(self.translate(table.name, table.columns[0].name)),
            quoted(self.translate(table.name, table.columns[1].name)),
            quoted(self.translate(table.name)),
            context_sql)
        cu = db.execute(sql, params)
        # The aggregated columns are split again, using the separators passed to group_concat.
        return {
            r[0]: [(k, v) if context is None else k
                   for k, v in zip(r[1].split(), r[2].split('||'))] for r in cu.fetchall()}

    def separator(self, tname: str, cname: str) -> typing.Optional[str]:
        """
        :return: separator for the column specified by db schema names `tname` and `cname`.
        """
        # The tdict property creates a new dict on each access, so we access it only once.
        for name, table in self.tdict.items():
            if self.translate(name) == tname:
                for col in table.columns:
                    if self.translate(name, col.name) == cname:
                        return col.separator

    def split_value(self, tname, cname, value) -> typing.Union[typing.List[str], str, None]:
        """
        :return: `value` split on the separator of the column specified by db schema names \
        `tname` and `cname`; or `value` unchanged, if the column has no separator.
        """
        sep = self.separator(tname, cname)
        return (value or '').split(sep) if sep else value

    def read(self) -> typing.Dict[str, typing.List[typing.OrderedDict]]:
        """
        :return: A `dict` where keys are SQL table names corresponding to CSVW tables and values \
        are lists of rows, represented as dicts where keys are the SQL column names.
        """
        res = collections.defaultdict(list)
        tdict = self.tdict
        with self.connection() as conn:
            for tname in self.tg.tabledict:
                #
                # FIXME: how much do we want to use DB types? Probably as much as possible!
                # Thus we need to convert on write **and** read!
                #
                convert, seps, refs = {}, {}, collections.defaultdict(dict)
                table = tdict[tname]  # The TableSpec object.

                # Assemble the conversion dictionary:
                for col in table.columns:
                    dbname = self.translate(tname, col.name)
                    if col.csvw_type in TYPE_MAP:
                        convert[dbname] = [col.name, TYPE_MAP[col.csvw_type][2]]
                    else:
                        convert[dbname] = [col.name, DATATYPES[col.csvw_type].to_python]
                    if col.separator:
                        # Lists of strings are stored concatenated, other lists as JSON.
                        seps[dbname] = col.separator if col.csvw_type == 'string' else 'json'

                # Retrieve the many-to-many relations:
                for col, at in table.many_to_many.items():
                    for pk, v in self.select_many_to_many(conn, at, col).items():
                        refs[pk][self.translate(tname, col)] = v

                cols, rows = select(conn, self.translate(tname))
                for row in rows:
                    d = collections.OrderedDict()
                    for k, v in zip(cols, row):
                        if k in seps:
                            if v is None:
                                d[k] = None
                            elif not v:
                                d[k] = []
                            elif seps[k] == 'json':
                                d[k] = json.loads(v)
                            else:
                                d[k] = [convert[k][1](v_) for v_ in (v or '').split(seps[k])]
                        else:
                            d[k] = convert[k][1](v) if v is not None else None
                    pk = d[self.translate(tname, table.primary_key[0])] \
                        if table.primary_key and len(table.primary_key) == 1 else None
                    # Rows without related items get an empty list. The keys must be translated
                    # like the ones in refs, otherwise a custom translator would result in one
                    # key for the empty default and a different one for the actual values.
                    d.update({self.translate(tname, k): [] for k in table.many_to_many})
                    d.update(refs.get(pk, {}))
                    res[self.translate(tname)].append(d)
        return res

    def association_table_context(self, table, column, fkey):
        """
        Context for association tables is created calling this method.

        Note: If a custom value for the `context` column is created by overwriting this method,
        `select_many_to_many` must be adapted accordingly, to make sure the custom
        context is retrieved when reading the data from the db.

        :param table:
        :param column:
        :param fkey:
        :return: a pair (foreign key, context)
        """
        # The default implementation takes the column name as context:
        return fkey, column

    def write_from_tg(self, _force=False, _exists_ok=False, _skip_extra=False):
        """
        Write the data read from the TableGroup to the database.

        :param _force: Passed as `force` to :meth:`write`.
        :param _exists_ok: Passed on to :meth:`write`.
        :param _skip_extra: Passed on to :meth:`write`.
        """
        return self.write(
            force=_force,
            _exists_ok=_exists_ok,
            _skip_extra=_skip_extra,
            **self.tg.read())

    def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items):
        """
        Creates a db file with the core schema.

        :param force: If `True` an existing db file will be overwritten.
        :param _exists_ok: NOTE(review): accepted, but not evaluated in this method - confirm \
        the intended behaviour.
        :param _skip_extra: If truthy, items in the data without corresponding column in the \
        schema are skipped, rather than raising a `ValueError`.
        :param items: Lists of rows (`dict`-like objects keyed by CSVW column name), passed as \
        keyword arguments named after the CSVW table.
        :raises ValueError: If the db file exists and `force` is not set, or if a row contains \
        an unspecified column and `_skip_extra` is not set.
        """
        if self.fname and self.fname.exists():
            if not force:
                raise ValueError('db file already exists, use force=True to overwrite')
            else:
                self.fname.unlink()

        with self.connection() as db:
            for table in self.tables:
                db.execute(table.sql(translate=self.translate))

            db.execute('PRAGMA foreign_keys = ON;')
            db.commit()

            refs = collections.defaultdict(list)  # collects rows in association tables.
            for t in self.tables:
                if t.name not in items:
                    continue
                rows, keys = [], []
                cols = {c.name: c for c in t.columns}
                for i, row in enumerate(items[t.name]):
                    pk = row[t.primary_key[0]] \
                        if t.primary_key and len(t.primary_key) == 1 else None
                    values = []
                    for k, v in row.items():
                        if k in t.many_to_many:
                            assert pk
                            at = t.many_to_many[k]
                            atkey = tuple([at.name] + [c.name for c in at.columns])
                            # We distinguish None - meaning NULL - and [] - meaning no items - as
                            # values of list-valued columns.
                            for vv in (v or []):
                                fkey, context = self.association_table_context(t, k, vv)
                                refs[atkey].append((pk, fkey, context))
                        else:
                            if k not in cols:
                                if _skip_extra:
                                    continue
                                else:
                                    raise ValueError(
                                        'unspecified column {0} found in data'.format(k))
                            col = cols[k]
                            if isinstance(v, list):
                                # Note: This assumes list-valued columns are of datatype string!
                                if col.csvw_type == 'string':
                                    v = (col.separator or ';').join(
                                        col.convert(vv) or '' for vv in v)
                                else:
                                    v = json.dumps(v)
                            else:
                                v = col.convert(v) if v is not None else None
                            if i == 0:
                                # The column names for the INSERT are taken from the first row.
                                keys.append(col.name)
                            values.append(v)
                    rows.append(tuple(values))
                insert(db, self.translate, t.name, keys, *rows)

            # Association tables are filled after all regular tables.
            for atkey, rows in refs.items():
                insert(db, self.translate, atkey[0], atkey[1:], *rows)

            db.commit()
|
||||
@@ -0,0 +1,441 @@
|
||||
"""Support for reading delimiter-separated value files.
|
||||
|
||||
This module contains unicode aware replacements for :func:`csv.reader`
|
||||
and :func:`csv.writer`. It was stolen/extracted from the ``csvkit``
|
||||
project to allow re-use when the whole ``csvkit`` package isn't
|
||||
required.
|
||||
|
||||
The original implementations were largely copied from
|
||||
`examples in the csv module documentation <http://docs.python.org/library/csv.html\
|
||||
#examples>`_.
|
||||
|
||||
.. seealso:: http://en.wikipedia.org/wiki/Delimiter-separated_values
|
||||
"""
|
||||
import io
|
||||
import csv
|
||||
import codecs
|
||||
import shutil
|
||||
import typing
|
||||
import pathlib
|
||||
import tempfile
|
||||
import warnings
|
||||
import functools
|
||||
import collections
|
||||
|
||||
from . import utils
|
||||
from .dsv_dialects import Dialect
|
||||
|
||||
__all__ = [
|
||||
'UnicodeWriter',
|
||||
'UnicodeReader', 'UnicodeReaderWithLineNumber', 'UnicodeDictReader', 'NamedTupleReader',
|
||||
'iterrows',
|
||||
'rewrite', 'add_rows', 'filter_rows_as_dict',
|
||||
]
|
||||
|
||||
LINES_OR_PATH = typing.Union[str, pathlib.Path, typing.IO, typing.Iterable[str]]
|
||||
|
||||
|
||||
def normalize_encoding(encoding: str) -> str:
    """
    :param encoding: Name (or alias) of an encoding known to Python's codec registry.
    :return: The name under which `codecs.lookup` reports the codec.
    """
    codec_info = codecs.lookup(encoding)
    return codec_info.name
|
||||
|
||||
|
||||
class UnicodeWriter:
    """
    Context manager writing rows of Unicode data as delimiter-separated values.

    :param f: The target to which to write the data; a local path specified as `str` or \
    `pathlib.Path`, or `None`, in which case the data is collected in memory and can be \
    retrieved via :meth:`~UnicodeWriter.read`
    :param dialect: Either a dialect name as recognized by `csv.writer` or a \
    :class:`~Dialect` instance for dialect customization beyond what can be done with \
    `csv.writer`.
    :param kw: Keyword arguments passed through to `csv.writer` (an `encoding` keyword is \
    consumed here and used for opening the target file).

    .. code-block:: python

        >>> from csvw import UnicodeWriter
        >>> with UnicodeWriter('data.tsv', delimiter='\t') as writer:
        ...     writer.writerow(['ä', 'ö', 'ü'])
    """

    def __init__(
            self,
            f: typing.Optional[typing.Union[str, pathlib.Path]] = None,
            dialect: typing.Optional[typing.Union[Dialect, str]] = None,
            **kw):
        self.f = f
        requested_encoding = kw.pop('encoding', 'utf-8')
        if isinstance(dialect, Dialect):
            # A Dialect object determines the encoding and the base formatting parameters;
            # explicit keyword arguments still override individual parameters.
            requested_encoding = dialect.python_encoding
            params = dialect.as_python_formatting_parameters()
            params.update(kw)
        else:
            params = kw
            if dialect:
                params['dialect'] = dialect
        self.kw = params
        self.encoding = normalize_encoding(requested_encoding)

        self.escapechar = self.kw.get('escapechar')
        if self.escapechar and self.kw.get('quoting') != csv.QUOTE_NONE:
            # work around https://bugs.python.org/issue12178
            # (csv.writer doesn't escape escapechar while csv.reader expects it)
            single, doubled = self.escapechar, 2 * self.escapechar

            def _escapedoubled(row):
                return [
                    cell.replace(single, doubled) if isinstance(cell, str) else cell
                    for cell in row]
        else:
            def _escapedoubled(row):
                return row

        self._escapedoubled = _escapedoubled
        # Only files opened by this object are closed on exit.
        self._close = False

    def __enter__(self):
        target = self.f
        if target is None:
            # No target given: collect the output in memory, see `read`.
            self.f = io.StringIO(newline='')
        elif isinstance(target, (str, pathlib.Path)):
            self.f = str(target)
            self.f = io.open(self.f, 'wt', encoding=self.encoding, newline='')
            self._close = True

        self.writer = csv.writer(self.f, **self.kw)
        return self

    def read(self) -> typing.Optional[bytes]:
        """
        If the writer has been initialized passing `None` as target, the CSV data as `bytes` can be
        retrieved calling this method.

        :return: The content of the underlying stream, encoded as UTF-8, or `None` if the \
        target object has no `read` method.
        """
        stream = self.f
        if hasattr(stream, 'seek'):
            stream.seek(0)
        if not hasattr(stream, 'read'):
            return None
        return stream.read().encode('utf-8')

    def __exit__(self, type, value, traceback):
        if not self._close:
            return
        self.f.close()

    def writerow(self, row: typing.Union[tuple, list]):
        """Write a single row, doubling escape characters in `str` cells where needed."""
        prepared = self._escapedoubled(row)
        self.writer.writerow(prepared)

    def writerows(self, rows: typing.Iterable[typing.Union[tuple, list]]):
        """Write each row of `rows` via :meth:`writerow`."""
        for one_row in rows:
            self.writerow(one_row)
|
||||
|
||||
|
||||
class UnicodeReader:
    """
    Read Unicode data from a csv file.

    :param f: The source from which to read the data; a local path specified as `str` or \
    `pathlib.Path`, a file-like object or a `list` of lines.
    :param dialect: Either a dialect name as recognized by `csv.reader` or a \
    :class:`~Dialect` instance for dialect customization beyond what can be done with \
    `csv.reader`.
    :param kw: Keyword arguments passed through to `csv.reader` (`encoding` and \
    `lineterminator` are consumed here and used for opening/decoding the source).

    .. code-block:: python

        >>> with UnicodeReader('tests/fixtures/frictionless-data.csv', delimiter='|') as reader:
        ...     for row in reader:
        ...         print(row)
        ...         break
        ...
        ['FK', 'Year', 'Location name', 'Value', 'binary', 'anyURI', 'email', 'boolean', 'array',
        'geojson']
    """
    def __init__(
            self,
            f: LINES_OR_PATH,
            dialect: typing.Optional[typing.Union[Dialect, str]] = None,
            **kw):
        self.f = f
        self.encoding = normalize_encoding(kw.pop('encoding', 'utf-8-sig'))
        self.newline = kw.pop('lineterminator', None)
        self.dialect = dialect if isinstance(dialect, Dialect) else None
        if self.dialect:
            # Normalize the dialect's encoding name as well (as UnicodeWriter does), so that
            # spellings like "UTF-8" or "utf8" are recognized by the BOM handling below.
            self.encoding = normalize_encoding(self.dialect.python_encoding)
            self.kw = dialect.as_python_formatting_parameters()
            self.kw.update(kw)
        else:
            self.kw = kw
            if dialect:
                self.kw['dialect'] = dialect
        # Only files opened by this object are closed on exit.
        self._close = False
        # (lineno, text) pairs collected for comment rows and skipped rows, see `__next__`.
        self.comments = []

        # We potentially screw people with valid CSV files where the content - presumably the
        # header - starts with 0xfeff. But the chance of irritating people trying to read Excel
        # exported CSV with the defaults seems way bigger - and anyone with CSV column names
        # starting with 0xfeff will run into more trouble down the line anyway ...
        if self.encoding == 'utf-8':
            self.encoding = 'utf-8-sig'

        # Encoding used to decode any `bytes` cells handed out by the underlying reader.
        self._reader_encoding = self.encoding

    def __enter__(self):
        if isinstance(self.f, (str, pathlib.Path)):
            if isinstance(self.f, pathlib.Path):
                self.f = str(self.f)

            self.f = io.open(self.f, mode='rt', encoding=self.encoding, newline=self.newline or '')
            self._close = True
        elif not hasattr(self.f, 'read'):
            # An iterable of lines: decode any `bytes` lines up-front.
            self.f = [
                line.decode(self.encoding) if isinstance(line, bytes) else line
                for line in self.f]
        self.reader = csv.reader(self.f, **self.kw)
        # 0-based line counter; -1 means no row has been read yet.
        self.lineno = -1
        return self

    def _next_row(self):
        """Fetch the next raw row and advance `lineno`, counting newlines embedded in cells."""
        self.lineno += 1
        row = [
            s if isinstance(s, str) else s.decode(self._reader_encoding)
            for s in next(self.reader)]
        self.lineno += sum(s.count('\n') for s in row)
        return row

    def __next__(self):
        """
        Return the next data row.

        With a :class:`Dialect`, rows are skipped while they start with the comment prefix,
        are blank (if `skipBlankRows` is set) or fall within the first `skipRows` lines;
        comment rows and non-empty skipped rows are recorded in `self.comments`. The returned
        row is trimmed per `dialect.trim` and stripped of the first `skipColumns` cells.

        :raises StopIteration: When the underlying reader is exhausted.
        """
        row = self._next_row()
        if self.dialect:
            while (row and self.dialect.commentPrefix and  # noqa: W504
                    row[0].startswith(self.dialect.commentPrefix)) or \
                    ((not row or set(row) == {''}) and self.dialect.skipBlankRows) or \
                    (self.lineno < self.dialect.skipRows):
                if (row and self.dialect.commentPrefix and  # noqa: W504
                        row[0].startswith(self.dialect.commentPrefix)) or \
                        (row and self.lineno < self.dialect.skipRows):
                    # NOTE(review): `lstrip` removes any leading run of the prefix's
                    # characters, not just one occurrence of the prefix - confirm intended.
                    self.comments.append((
                        self.lineno,
                        self.dialect.delimiter.join(row).lstrip(self.dialect.commentPrefix).strip(),
                    ))
                row = self._next_row()
            row = [self.dialect.trimmer(s) for s in row][self.dialect.skipColumns:]
        return row

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self._close:
            self.f.close()

    def __iter__(self):
        return self
|
||||
|
||||
|
||||
class UnicodeReaderWithLineNumber(UnicodeReader):
    """
    A `UnicodeReader` yielding (lineno, row) pairs, where "lineno" is a 1-based number of a
    **text line** of the (possibly multi-line) row in the DSV file.

    NOTE(review): the number is derived from the reader's `lineno` counter *after* the row has
    been read, i.e. after newlines embedded in the row have been added - for multi-line rows
    this looks like the line on which the row ends rather than starts; confirm with callers.
    """
    def __next__(self):
        """
        :return: a pair (1-based line number in the input, row)
        """
        # Fetching the row first is essential: it advances the line counter.
        row = super().__next__()
        one_based = self.lineno + 1
        return one_based, row
|
||||
|
||||
|
||||
class UnicodeDictReader(UnicodeReader):
    """
    A `UnicodeReader` yielding one `dict` per row.

    :param f: As for :class:`UnicodeReader`
    :param fieldnames: List of keys for the row dicts; if `None`, the first row read from \
    the source is used.
    :param restkey: Key under which surplus cells of rows longer than `fieldnames` are stored.
    :param restval: Value assigned to the keys missing from rows shorter than `fieldnames`.

    .. code-block:: python

        >>> with UnicodeDictReader(
        ...         'tests/fixtures/frictionless-data.csv',
        ...         dialect=Dialect(delimiter='|', header=False),
        ...         fieldnames=[str(i) for i in range(1, 11)]) as reader:
        ...     for row in reader:
        ...         print(row)
        ...         break
        ...
        OrderedDict([('1', 'FK'), ('2', 'Year'), ('3', 'Location name'), ('4', 'Value'),
        ('5', 'binary'), ('6', 'anyURI'), ('7', 'email'), ('8', 'boolean'), ('9', 'array'),
        ('10', 'geojson')])

    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None, **kw):
        self._fieldnames = fieldnames
        self.restkey = restkey
        self.restval = restval
        # Mirrors `csv.reader.line_num`; 0 signals that nothing has been read yet.
        self.line_num = 0
        super().__init__(f, **kw)

    @property
    def fieldnames(self):
        """
        The keys used for the row dicts.

        If no fieldnames were passed to the constructor, the first access consumes the next
        row from the source. A warning is issued whenever the names contain duplicates.
        """
        if self._fieldnames is None:
            try:
                self._fieldnames = super().__next__()
            except StopIteration:
                # Empty source: leave the fieldnames unset.
                pass
        self.line_num = self.reader.line_num
        names = self._fieldnames
        if names and len(set(names)) != len(names):
            warnings.warn('Duplicate column names!')
        return self._fieldnames

    def __next__(self):
        if self.line_num == 0:
            # Accessed purely for the side effect of reading the header row.
            self.fieldnames
        row = super().__next__()
        self.line_num = self.reader.line_num

        # In contrast to the plain reader, blank rows are skipped here - they would only
        # produce dicts without meaningful values.
        while row == []:
            row = super().__next__()
        return self.item(row)

    def item(self, row):
        """
        Convert a list of cell values into an ordered mapping keyed by fieldname.

        Surplus cells are collected in a list under `restkey`; missing cells are filled with
        `restval`.
        """
        record = collections.OrderedDict(zip(self.fieldnames, row))
        n_names = len(self.fieldnames)
        n_cells = len(row)
        if n_names < n_cells:
            record[self.restkey] = row[n_names:]
        elif n_names > n_cells:
            for missing in self.fieldnames[n_cells:]:
                record[missing] = self.restval
        return record
|
||||
|
||||
|
||||
class NamedTupleReader(UnicodeDictReader):
    """
    A `UnicodeReader` yielding one `namedtuple` per row.

    .. note::

        This reader has some limitations, notably that fieldnames must be normalized to be
        admissible Python names, but also bad performance (compared with `UnicodeDictReader`).
    """

    # Function turning a column name into the attribute name used on the namedtuple.
    _normalize_fieldname = staticmethod(utils.normalize_name)

    @functools.cached_property
    def cls(self):
        """The `namedtuple` class `Row`, with one field per normalized fieldname."""
        normalized = [self._normalize_fieldname(name) for name in self.fieldnames]
        return collections.namedtuple('Row', normalized)

    def item(self, row):
        """Build a `Row` namedtuple from a list of cell values."""
        record = super().item(row)
        # Make sure every declared field is present, even for short rows.
        for name in self.fieldnames:
            record.setdefault(name, None)
        # Drop anything not belonging to a declared field (e.g. the `restkey` entry).
        kwargs = {}
        for key, value in record.items():
            if key in self.fieldnames:
                kwargs[self._normalize_fieldname(key)] = value
        return self.cls(**kwargs)
|
||||
|
||||
|
||||
def iterrows(lines_or_file: LINES_OR_PATH,
             namedtuples: typing.Optional[bool] = False,
             dicts: typing.Optional[bool] = False,
             encoding: typing.Optional[str] = 'utf-8',
             **kw) -> typing.Generator:
    """Convenience factory function for csv reader.

    :param lines_or_file: Content to be read. Either a file handle, a file path or a list\
    of strings.
    :param namedtuples: Yield namedtuples.
    :param dicts: Yield dicts.
    :param encoding: Encoding of the content.
    :param kw: Keyword parameters are passed through to csv.reader.
    :return: A generator over the rows.
    :raises ValueError: On first iteration, if both `namedtuples` and `dicts` are requested.
    """
    if namedtuples and dicts:
        raise ValueError('either namedtuples or dicts can be chosen as output format')

    if namedtuples:
        reader_cls = NamedTupleReader
    elif dicts:
        reader_cls = UnicodeDictReader
    else:
        reader_cls = UnicodeReader

    with reader_cls(lines_or_file, encoding=encoding, **kw) as rows:
        yield from rows


# `reader` is a second name for the very same function object.
reader = iterrows
|
||||
|
||||
|
||||
def rewrite(fname: typing.Union[str, pathlib.Path],
            visitor: typing.Callable[[int, typing.List[str]], typing.Union[None, typing.List[str]]],
            **kw):
    """Utility function to rewrite rows in dsv files.

    The rewritten content is first written to a temporary file, which replaces `fname` only
    after all rows have been processed; the temporary file is removed if anything fails.

    :param fname: Path of the dsv file to operate on.
    :param visitor: A callable that takes a line-number and a row as input and returns a \
    (modified) row or None to filter out the row.
    :param kw: Keyword parameters are passed through to csv.reader/csv.writer.
    """
    fname = utils.ensure_path(fname)
    assert fname.is_file()
    # Only the name of the temporary file is needed; it is re-opened by the writer below.
    with tempfile.NamedTemporaryFile(delete=False) as fp:
        tmp = pathlib.Path(fp.name)

    try:
        with UnicodeReader(fname, **kw) as reader_, UnicodeWriter(tmp, **kw) as writer:
            for i, row in enumerate(reader_):
                row = visitor(i, row)
                if row is not None:
                    writer.writerow(row)
        shutil.move(str(tmp), str(fname))
    finally:
        # After a successful move the temporary path is gone; otherwise (visitor, reader or
        # writer raised) make sure the temporary file is not left behind.
        tmp.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def add_rows(fname: typing.Union[str, pathlib.Path], *rows: typing.List[str]):
    """Append rows to a dsv file, creating the file if it does not exist.

    Existing rows (if any) are copied to a temporary file, `rows` are appended, and the
    temporary file then replaces `fname`; it is removed if anything fails.

    :param fname: Path of the dsv file to operate on.
    :param rows: The rows to append.
    """
    # Validate the target before creating any temporary file.
    fname = utils.ensure_path(fname)
    with tempfile.NamedTemporaryFile(delete=False) as fp:
        tmp = pathlib.Path(fp.name)

    try:
        with UnicodeWriter(tmp) as writer:
            if fname.exists():
                with UnicodeReader(fname) as reader_:
                    for row in reader_:
                        writer.writerow(row)
            writer.writerows(rows)
        shutil.move(str(tmp), str(fname))
    finally:
        # After a successful move the temporary path is gone; on failure clean it up.
        tmp.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def filter_rows_as_dict(fname: typing.Union[str, pathlib.Path],
                        filter_: typing.Callable[[dict], bool],
                        **kw) -> int:
    """Rewrite a dsv file, filtering the rows.

    :param fname: Path to dsv file
    :param filter_: callable which accepts a `dict` with a row's data as single argument\
    returning a `Boolean` indicating whether to keep the row (`True`) or to discard it \
    `False`.
    :param kw: Keyword arguments to be passed `UnicodeReader` and `UnicodeWriter`.
    :return: The number of rows that have been removed.
    """
    # The wrapper adapts the dict-based predicate to the (index, row) visitor protocol of
    # `rewrite` and keeps count of the rejected rows.
    row_visitor = DictFilter(filter_)
    rewrite(fname, row_visitor, **kw)
    return row_visitor.removed
|
||||
|
||||
|
||||
class DictFilter(object):
    """
    Row visitor applying a `dict`-based predicate to the rows of a table with header.

    Instances are called with `(i, row)`: the row with index 0 is remembered as header and
    passed through; every later non-empty row is turned into a `dict` keyed by the header and
    kept only if the predicate returns a true value. `removed` counts the rejected rows.
    """

    def __init__(self, filter_):
        self.header = None
        self.filter = filter_
        self.removed = 0

    def __call__(self, i, row):
        """
        :return: `row` if it is to be kept, `None` otherwise (empty rows always yield `None` \
        and are not counted as removed).
        """
        if i == 0:
            self.header = row
            return row
        if not row:
            return None
        if self.filter(dict(zip(self.header, row))):
            return row
        self.removed += 1
        return None
|
||||
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
DSV data can be surprisingly diverse. While Python's `csv` module offers out-of-the-box support
|
||||
for the basic formatting parameters, CSVW recognizes a couple more, like `skipColumns` or
|
||||
`skipRows`.
|
||||
|
||||
.. seealso::
|
||||
|
||||
- `<https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/#dialect-descriptions>`_
|
||||
- `<https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters>`_
|
||||
- `<https://specs.frictionlessdata.io/csv-dialect/>`_
|
||||
"""
|
||||
import attr
|
||||
import warnings
|
||||
import functools
|
||||
|
||||
from . import utils
|
||||
|
||||
__all__ = ['Dialect']
|
||||
|
||||
ENCODING_MAP = {
|
||||
'UTF-8-BOM': 'utf-8-sig', # Recognize the name of this encoding in R.
|
||||
}
|
||||
|
||||
|
||||
# FIXME: replace with attrs.validators.ge(0) from attrs 21.3.0
|
||||
def _non_negative(instance, attribute, value):
    """Validator (attrs call signature) rejecting values below zero.

    :raises ValueError: If `value < 0`; the message names the offending attribute.
    """
    if not value < 0:
        return
    message = '{0} is not a valid {1}'.format(value, attribute.name)  # pragma: no cover
    raise ValueError(message)  # pragma: no cover
|
||||
|
||||
|
||||
non_negative_int = [attr.validators.instance_of(int), _non_negative]
|
||||
|
||||
|
||||
def convert_encoding(s):
    """Converter for the `encoding` property of a dialect.

    The value is first passed through `utils.converter` (with `'utf-8'` as default); if
    Python cannot encode text with the resulting encoding name (after mapping it via
    `ENCODING_MAP`), a warning is issued and `'utf-8'` is returned instead.
    """
    s = utils.converter(str, 'utf-8', s)
    try:
        # Probe whether the name denotes a usable text encoding.
        'x'.encode(ENCODING_MAP.get(s, s))
    except LookupError:
        warnings.warn('Invalid value for property: {}'.format(s))
        return 'utf-8'
    return s
|
||||
|
||||
|
||||
@attr.s
class Dialect(object):
    """
    A CSV dialect specification.

    Every attribute is passed through a converter (falling back to the attribute's default)
    and, where declared, a validator.

    .. seealso:: `<https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/#dialect-descriptions>`_
    """

    # Name of the character encoding; see `python_encoding` for the name Python understands.
    encoding = attr.ib(
        converter=convert_encoding,
        validator=attr.validators.instance_of(str),
        default='utf-8')

    lineTerminators = attr.ib(
        default=attr.Factory(lambda: ['\r\n', '\n']),
        converter=functools.partial(utils.converter, list, ['\r\n', '\n']))

    # `None` is an admissible value, see `escape_character`.
    quoteChar = attr.ib(
        default='"',
        converter=functools.partial(utils.converter, str, '"', allow_none=True),
    )

    doubleQuote = attr.ib(
        converter=functools.partial(utils.converter, bool, True),
        validator=attr.validators.instance_of(bool),
        default=True)

    skipRows = attr.ib(
        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
        validator=non_negative_int,
        default=0)

    commentPrefix = attr.ib(
        converter=functools.partial(utils.converter, str, '#', allow_none=True),
        validator=attr.validators.optional(attr.validators.instance_of(str)),
        default='#')

    header = attr.ib(
        converter=functools.partial(utils.converter, bool, True),
        validator=attr.validators.instance_of(bool),
        default=True)

    headerRowCount = attr.ib(
        converter=functools.partial(utils.converter, int, 1, cond=lambda s: s >= 0),
        validator=non_negative_int,
        default=1)

    delimiter = attr.ib(
        converter=functools.partial(utils.converter, str, ','),
        validator=attr.validators.instance_of(str),
        default=',')

    skipColumns = attr.ib(
        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
        validator=non_negative_int,
        default=0)

    skipBlankRows = attr.ib(
        converter=functools.partial(utils.converter, bool, False),
        validator=attr.validators.instance_of(bool),
        default=False)

    skipInitialSpace = attr.ib(
        converter=functools.partial(utils.converter, bool, False),
        validator=attr.validators.instance_of(bool),
        default=False)

    # One of 'true', 'false', 'start', 'end'; `bool` input is turned into 'true'/'false'.
    trim = attr.ib(
        converter=lambda v: utils.converter(
            (str, bool), 'false', '{0}'.format(v).lower() if isinstance(v, bool) else v),
        validator=attr.validators.in_(['true', 'false', 'start', 'end']),
        default='false')

    def updated(self, **kw):
        """Return a copy of this dialect with the attributes named in `kw` set to new values.

        The new values are assigned with `setattr` on the copy.
        """
        clone = type(self)(**attr.asdict(self))
        for name, value in kw.items():
            setattr(clone, name, value)
        return clone

    @functools.cached_property
    def escape_character(self):
        """`None` without quote character, else `'"'` if `doubleQuote` is set, else a backslash."""
        if self.quoteChar is None:
            return None
        return '"' if self.doubleQuote else '\\'

    @functools.cached_property
    def line_terminators(self):
        """`lineTerminators` as a list (a single `str` value is wrapped into a list)."""
        terminators = self.lineTerminators
        if isinstance(terminators, str):
            return [terminators]
        return terminators

    @functools.cached_property
    def trimmer(self):
        """The callable implementing the whitespace handling selected by `trim`."""
        by_mode = {
            'true': lambda s: s.strip(),
            'false': lambda s: s,
            'start': lambda s: s.lstrip(),
            'end': lambda s: s.rstrip()
        }
        return by_mode[self.trim]

    def asdict(self, omit_defaults=True):
        """Return the dialect as `dict`, delegating to `utils.attr_asdict`."""
        return utils.attr_asdict(self, omit_defaults=omit_defaults)

    @property
    def python_encoding(self):
        """The encoding name, translated via `ENCODING_MAP` where an entry exists."""
        return ENCODING_MAP.get(self.encoding, self.encoding)

    def as_python_formatting_parameters(self):
        """Return the dialect as keyword arguments for `csv.reader`/`csv.writer`."""
        return {
            'delimiter': self.delimiter,
            'doublequote': self.doubleQuote,
            # We have to hack around incompatible ways escape char is interpreted in csvw
            # and python's csv lib:
            'escapechar': None if self.doubleQuote else self.escape_character,
            'lineterminator': self.line_terminators[0],
            'quotechar': self.quoteChar,
            'skipinitialspace': self.skipInitialSpace,
            'strict': True,
        }
|
||||
@@ -0,0 +1,224 @@
|
||||
"""
|
||||
Functionality to convert tabular data in Frictionless Data Packages to CSVW.
|
||||
|
||||
We translate [table schemas](https://specs.frictionlessdata.io/table-schema/) defined
|
||||
for [data resources](https://specs.frictionlessdata.io/data-resource/) in a
|
||||
[data package](https://specs.frictionlessdata.io/data-package/) to a CSVW TableGroup.
|
||||
|
||||
This functionality can be used together with the `frictionless describe` command to add
|
||||
CSVW metadata to "raw" CSV tables.
|
||||
"""
|
||||
import json
|
||||
import pathlib
|
||||
|
||||
|
||||
def convert_column_spec(spec):
    """
    Convert a frictionless field descriptor into a CSVW column description.

    https://specs.frictionlessdata.io/table-schema/#field-descriptors

    :param spec: `dict` parsed from JSON representing a field descriptor; must have a `name`.
    :return: `dict` with the keys `name` and `datatype` and - depending on `spec` - `titles`, \
    `dc:description` and `propertyUrl`.
    :raises NotImplementedError: For numeric fields declaring `bareNumber: false`.
    """
    typemap = {
        'year': 'gYear',
        'yearmonth': 'gYearMonth',
    }

    titles = [t for t in [spec.get('title')] if t]

    res = {'name': spec['name'], 'datatype': {'base': 'string'}}
    datatype = res['datatype']
    if 'type' in spec:
        if spec['type'] == 'string' and spec.get('format') == 'binary':
            datatype['base'] = 'binary'
        elif spec['type'] == 'string' and spec.get('format') == 'uri':
            datatype['base'] = 'anyURI'
        elif spec['type'] in typemap:
            datatype['base'] = typemap[spec['type']]
        elif spec['type'] in [
            'string', 'number', 'integer', 'boolean', 'date', 'time', 'datetime', 'duration',
        ]:
            # Types for which frictionless and CSVW use the same name.
            datatype['base'] = spec['type']
            if spec['type'] == 'string' and spec.get('format'):
                datatype['dc:format'] = spec['format']
            if spec['type'] == 'boolean' and spec.get('trueValues') and spec.get('falseValues'):
                # Only the first true/false value can be carried over.
                datatype['format'] = '{}|{}'.format(
                    spec['trueValues'][0], spec['falseValues'][0])
            if spec['type'] in ['number', 'integer']:
                # `bareNumber` defaults to true in Table Schema (plain numbers); only
                # `false` - values with leading/trailing non-numeric characters - is the
                # case that cannot be expressed here.
                if spec.get('bareNumber') is False:  # pragma: no cover
                    raise NotImplementedError(
                        'bareNumber is not supported in CSVW. It may be possible to translate to '
                        'a number pattern, though. See '
                        'https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/'
                        '#formats-for-numeric-types')
                if any(prop in spec for prop in ['decimalChar', 'groupChar']):
                    datatype['format'] = {
                        p: spec[p] for p in ['decimalChar', 'groupChar'] if spec.get(p)}
        elif spec['type'] in ['object', 'array']:
            datatype['base'] = 'json'
            datatype['dc:format'] = 'application/json'
        elif spec['type'] == 'geojson':
            datatype['base'] = 'json'
            datatype['dc:format'] = 'application/geo+json'

    if titles:
        res['titles'] = titles
    if 'description' in spec:
        res['dc:description'] = [spec['description']]
    if 'rdfType' in spec:
        res['propertyUrl'] = spec['rdfType']

    constraints = spec.get('constraints', {})
    for prop in ['required', 'minLength', 'maxLength', 'minimum', 'maximum']:
        if prop in constraints:
            datatype[prop] = constraints[prop]
    # A pattern constraint is only used if no other format has been derived above.
    if ('pattern' in constraints) and ('format' not in datatype):
        datatype['format'] = constraints['pattern']
    # FIXME: we could transform the "enum" constraint for string into
    # a regular expression in the "format" property.
    return res
|
||||
|
||||
|
||||
def convert_foreignKey(rsc_name, fk, resource_map):
    """
    Convert a frictionless foreign key description into its CSVW counterpart.

    https://specs.frictionlessdata.io/table-schema/#foreign-keys

    :param rsc_name: Name of the resource the key belongs to; used as target when the \
    referenced resource name is empty (self-referential keys).
    :param fk: `dict` with `fields` and `reference` (having `fields` and `resource`).
    :param resource_map: `dict` mapping resource names to the values used as `resource`.
    :return: `dict` with `columnReference` and `reference`.
    """
    # "fields" becomes "columnReference", and the resource name is replaced by its mapped value.
    own_columns = fk['fields']
    target = fk['reference']
    reference = {
        'columnReference': target['fields'],
        'resource': resource_map[target['resource'] or rsc_name],
    }
    return {'columnReference': own_columns, 'reference': reference}
|
||||
|
||||
|
||||
def convert_table_schema(rsc_name, schema, resource_map):
    """
    :param rsc_name: `name` property of the resource the schema belongs to. Needed to resolve \
    self-referential foreign keys.
    :param schema: `dict` parsed from JSON representing a frictionless Table Schema object.
    :param resource_map: `dict` mapping resource names to resource paths, needed to convert foreign\
    key constraints.
    :return: `dict` suitable for instantiating a `csvw.metadata.Schema` object.
    """
    res = {'columns': [convert_column_spec(field) for field in schema['fields']]}
    # (frictionless property, CSVW property) pairs, copied over if present in the schema.
    renames = [
        ('missingValues', 'null'),
        ('primaryKey', 'primaryKey'),
        ('foreignKeys', 'foreignKeys'),
    ]
    for source, target in renames:
        if source not in schema:
            continue
        res[target] = schema[source]
        if source == 'foreignKeys':
            res[target] = [
                convert_foreignKey(rsc_name, fk, resource_map) for fk in res[target]]
    return res
|
||||
|
||||
|
||||
def convert_dialect(rsc):
    """
    Derive a CSVW dialect description from a frictionless data resource.

    Limitations: lineTerminator is not supported.

    https://specs.frictionlessdata.io/csv-dialect/

    :param rsc: `dict` describing the resource; its `dialect` and `encoding` items are used.
    :return: `dict` of CSVW dialect properties.
    """
    dialect = rsc.get('dialect', {})
    # Work around https://github.com/frictionlessdata/frictionless-py/issues/1506
    if 'csv' in dialect:
        dialect = dialect['csv']

    res = {}
    delimiter = dialect.get('delimiter')
    if delimiter:
        res['delimiter'] = delimiter
    encoding = rsc.get('encoding')
    if encoding:
        res['encoding'] = encoding
    # Properties with identical names in both specifications are copied whenever present.
    same_name = ('delimiter', 'quoteChar', 'doubleQuote', 'skipInitialSpace', 'header')
    res.update((prop, dialect[prop]) for prop in same_name if prop in dialect)
    if 'commentChar' in dialect:
        res['commentPrefix'] = dialect['commentChar']
    return res
|
||||
|
||||
|
||||
class DataPackage:
    """
    A frictionless data package description, convertible to a CSVW TableGroup.

    :param spec: Another `DataPackage` (whose data is taken over), a `dict` (already parsed \
    JSON), a `pathlib.Path` of a JSON file, or a JSON formatted string.
    :param directory: Directory of the package; defaults to the parent of `spec` if that is a \
    path, and to the current directory otherwise.
    """
    def __init__(self, spec, directory=None):
        if isinstance(spec, DataPackage):
            self.json = spec.json
            self.dir = spec.dir
            return
        if isinstance(spec, dict):
            # already a parsed JSON object
            self.dir = pathlib.Path(directory or '.')
        elif isinstance(spec, pathlib.Path):
            # Wrap an explicit directory in a Path like the other branches do, since
            # `to_tablegroup` joins `self.dir` with a file name.
            self.dir = pathlib.Path(directory) if directory else spec.parent
            spec = json.loads(spec.read_text(encoding='utf8'))
        else:  # assume a JSON formatted string
            spec = json.loads(spec)
            self.dir = pathlib.Path(directory or '.')

        self.json = spec

    def to_tablegroup(self, cls=None):
        """
        Convert the package description to CSVW metadata.

        :param cls: Class providing a `fromvalue` factory; defaults to `csvw.TableGroup`.
        :return: The object created by `cls.fromvalue`, with `_fname` set to \
        `csvw-metadata.json` in the package directory.
        """
        from csvw import TableGroup

        md = {'@context': "http://www.w3.org/ns/csvw"}
        # Package metadata:
        md['dc:replaces'] = json.dumps(self.json)

        # version,
        # image,

        for flprop, csvwprop in [
            ('id', 'dc:identifier'),
            ('licenses', 'dc:license'),
            ('title', 'dc:title'),
            ('homepage', 'dcat:accessURL'),
            ('description', 'dc:description'),
            ('sources', 'dc:source'),
            ('contributors', 'dc:contributor'),
            ('profile', 'dc:conformsTo'),
            ('keywords', 'dc:subject'),
            ('created', 'dc:created'),
        ]:
            if flprop in self.json:
                md[csvwprop] = self.json[flprop]

        # The package name stands in for a missing id, or else for a missing title.
        if 'name' in self.json:
            if 'id' not in self.json:
                md['dc:identifier'] = self.json['name']
            elif 'title' not in self.json:
                md['dc:title'] = self.json['name']

        # Data Resource metadata:
        resources = [rsc for rsc in self.json.get('resources', []) if 'path' in rsc]
        resource_map = {rsc['name']: rsc['path'] for rsc in resources if 'name' in rsc}
        for rsc in resources:
            schema = rsc.get('schema')
            # Only local CSV files with a schema are converted to tables.
            if schema and \
                    rsc.get('scheme') == 'file' and \
                    rsc.get('format') == 'csv':
                # Table Schema:
                md.setdefault('tables', [])
                table = dict(
                    url=rsc['path'],
                    tableSchema=convert_table_schema(rsc.get('name'), schema, resource_map),
                    dialect=convert_dialect(rsc),
                )
                md['tables'].append(table)

        cls = cls or TableGroup
        res = cls.fromvalue(md)
        res._fname = self.dir / 'csvw-metadata.json'
        return res
|
||||
@@ -0,0 +1,190 @@
|
||||
import re
|
||||
import json
|
||||
import math
|
||||
import typing
|
||||
import decimal
|
||||
import pathlib
|
||||
import datetime
|
||||
import collections
|
||||
|
||||
import attr
|
||||
from rdflib import Graph, URIRef, Literal
|
||||
from rfc3986 import URIReference
|
||||
from isodate.duration import Duration
|
||||
|
||||
from .utils import is_url
|
||||
|
||||
__all__ = ['group_triples', 'to_json', 'Triple', 'format_value']
|
||||
|
||||
|
||||
def format_value(value, col):
    """
    Format values as JSON-LD literals.

    :param value: The Python object to format.
    :param col: Column description (or a false value); its `datatype.base` selects the \
    date/time rendering and its `datatype.formatted` is used for `timedelta`, `Duration` and \
    `bytes` values.
    :return: A `str` for the types handled explicitly, a `float` (or `'NaN'`/`'INF'`/`'-INF'`) \
    for floats and decimals, otherwise `value` unchanged.
    """
    if isinstance(value, (datetime.date, datetime.datetime, datetime.time)):
        res = value.isoformat()
        if col and col.datatype.base == 'time':
            res = res.split('T')[-1]
        if col and col.datatype.base == 'date':
            res = re.sub('T[0-9.:]+', '', res)
        if isinstance(value, (datetime.datetime, datetime.time)):
            stamp, _, fraction = res.partition('.')
            if not fraction:
                return stamp.replace('+00:00', 'Z')
            # What follows the "." is the fractional seconds plus - for aware values - the
            # UTC offset. Trailing zeros must only be stripped from the digits, not from
            # the offset ("+00:00" would end up as "+00:").
            digits = re.match('[0-9]*', fraction).group()
            offset = fraction[len(digits):]
            return '{}.{}{}'.format(stamp, digits.rstrip('0'), offset.replace('+00:00', 'Z'))
        return res  # pragma: no cover
    if isinstance(value, datetime.timedelta):
        return col.datatype.formatted(value)
    if isinstance(value, Duration):
        return col.datatype.formatted(value)
    if isinstance(value, decimal.Decimal):
        # Continue with the float handling below.
        value = float(value)
    if isinstance(value, URIReference):
        return value.unsplit()
    if isinstance(value, bytes):
        return col.datatype.formatted(value)
    if isinstance(value, pathlib.Path):
        return str(value)
    if isinstance(value, float):
        # Special float values are rendered as strings.
        return 'NaN' if math.isnan(value) else (
            '{}INF'.format('-' if value < 0 else '') if math.isinf(value) else value)
    return value
|
||||
|
||||
|
||||
@attr.s
class Triple:
    """
    A table cell's data as RDF triple.
    """
    about = attr.ib()
    property = attr.ib()
    value = attr.ib()

    def as_rdflib_triple(self):
        """Return a (subject, predicate, object) tuple of rdflib terms.

        The object is a `URIRef` if `is_url` accepts the value, a `Literal` otherwise.
        """
        subject = URIRef(self.about)
        predicate = URIRef(self.property)
        obj = URIRef(self.value) if is_url(self.value) else Literal(self.value)
        return (subject, predicate, obj)

    @classmethod
    def from_col(cls, table, col, row, prop, val, rownum):
        """
        Create the triple for a cell value.

        `propertyUrl`, `valueUrl` and `aboutUrl` are taken from `col` if given (the first two
        falling back to `table.inherit`), and - where set - expanded via `table.expand` to
        replace `prop`, `val` and the subject; the value is passed through `format_value`.

        NOTE(review): parameter semantics are inferred from usage in this method only.
        """
        header = col.header if col else None

        property_url = col.propertyUrl if col else table.inherit('propertyUrl')
        if property_url:
            prop = table.expand(property_url, row, _row=rownum, _name=header, qname=True)

        # Values of rdf:type are expanded to qualified names rather than URIs.
        is_type = prop == 'rdf:type'
        value_url = col.valueUrl if col else table.inherit('valueUrl')
        if value_url:
            val = table.expand(
                value_url, row, _row=rownum, _name=header, qname=is_type, uri=not is_type)
        val = format_value(val, col)

        subject = None
        about_url = col.aboutUrl if col else None
        if about_url:
            subject = table.expand(about_url, row, _row=rownum, _name=header) or None
        return cls(about=subject, property=prop, value=val)
|
||||
|
||||
|
||||
def frame(data: list) -> list:
    """
    Inline referenced items to force a deterministic graph layout.

    An item (looked up by its `@id`) is merged into the object referencing it if it is
    referenced from exactly one item in `data`; the referencing object is updated in place
    and the inlined item is dropped from the result.

    .. seealso:: https://w3c.github.io/json-ld-framing/#introduction

    :param data: `list` of `dict` items.
    :return: `list` of the items with an `@id` which have not been inlined.
    """
    by_id = collections.OrderedDict()
    # Maps a referenced id to (first referencing object seen, items holding a reference).
    references = {}
    for item in data:
        item_id = item.get('@id')
        if item_id:
            by_id[item_id] = item
        for values in item.values():
            candidates = values if isinstance(values, list) else [values]
            for candidate in candidates:
                if not isinstance(candidate, dict):
                    continue
                ref_id = candidate.get('@id')
                if ref_id:
                    references.setdefault(ref_id, (candidate, []))[1].append(item)

    for ref, referrers in references.values():
        if len(referrers) == 1 and ref['@id'] in by_id:
            ref.update(by_id.pop(ref['@id']))
    return list(by_id.values())
|
||||
|
||||
|
||||
def to_json(obj, flatten_list=False):
    """
    Simplify JSON-LD data by refactoring trivial objects.

    - Value objects (`dict`s with an `@value` key) are replaced by their value.
    - `dict`s with `@id` as only key are replaced by the identifier.
    - The key `rdf:type` is renamed to `@type`.
    - If `flatten_list` is `True`, single-item lists are replaced by their only item.

    :param obj: Arbitrary JSON data.
    :param flatten_list: Flag signaling whether to unwrap single-item lists.
    :return: The simplified JSON data.
    """
    if isinstance(obj, dict):
        if '@value' in obj:
            obj = obj['@value']
        # The unwrapped value may be a number, boolean or None, so it must be checked for
        # being a dict again before calling len().
        if isinstance(obj, dict) and len(obj) == 1 and '@id' in obj:
            obj = obj['@id']
    if isinstance(obj, dict):
        return {
            '@type' if k == 'rdf:type' else k: to_json(v, flatten_list=flatten_list)
            for k, v in obj.items()}
    if isinstance(obj, list):
        if len(obj) == 1 and flatten_list:
            return to_json(obj[0], flatten_list=flatten_list)
        return [to_json(v, flatten_list=flatten_list) for v in obj]
    return obj
|
||||
|
||||
|
||||
def group_triples(triples: typing.Iterable[Triple]) -> typing.List[dict]:
    """
    Group and frame triples into a `list` of JSON objects.

    Triples without subject (`about`) become properties of a single top-level object; triples
    with a subject are routed through an RDF graph, serialized as JSON-LD, framed and
    simplified.

    :param triples: The triples to group. List values of triples may be extended in place.
    :return: `list` of `dict`s.
    """
    # Fold list-valued triples into the first list-valued triple with the same property.
    merged = []
    for triple in triples:
        target = None
        if isinstance(triple.value, list):
            target = next(
                (t for t in merged
                 if t.property == triple.property and isinstance(t.value, list)),
                None)
        if target is None:
            merged.append(triple)
        else:
            target.value.extend(triple.value)

    # Separate top-level properties (no subject) from triples which need the graph.
    grouped = collections.OrderedDict()
    nested = []
    for triple in merged:
        prop = triple.property
        if triple.about is None and prop == '@id':
            grouped[prop] = triple.value
        elif triple.about:
            nested.append(triple)
        elif prop in grouped:
            # For test48: Repeated properties are collected in a list.
            if not isinstance(grouped[prop], list):
                grouped[prop] = [grouped[prop]]
            grouped[prop].append(triple.value)
        else:
            grouped[prop] = triple.value
    if not nested:
        return [grouped]

    graph = Graph()
    for triple in nested:
        graph.add(triple.as_rdflib_triple())
    if '@id' in grouped:
        for prop, val in grouped.items():
            if prop != '@id':
                graph.add(
                    Triple(about=grouped['@id'], property=prop, value=val).as_rdflib_triple())

    # Frame and simplify the objects of the serialized graph.
    framed = frame(json.loads(graph.serialize(format='json-ld')))
    simplified = [to_json(obj, flatten_list=True) for obj in framed]
    # Stable sort, moving the object with the row's aboutUrl as @id to the front while keeping
    # the relative order of all other objects.
    row_id = grouped.get('@id')
    res = sorted(simplified, key=lambda o: o.get('@id') != row_id)

    # Without aboutUrl for the row and with only one object from the graph, everything is
    # merged into a single object.
    if grouped and '@id' not in grouped and len(res) == 1:
        grouped.update(res[0])
        return [grouped]
    return res
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,230 @@
|
||||
import re
|
||||
import copy
|
||||
import html
|
||||
import json
|
||||
import string
|
||||
import keyword
|
||||
import pathlib
|
||||
import warnings
|
||||
import collections
|
||||
import unicodedata
|
||||
|
||||
import attr
|
||||
|
||||
|
||||
def is_url(s):
    """
    Check whether the string representation of `s` starts with `http://` or `https://`.

    :return: The (truthy) match object if it does, `None` otherwise.
    """
    return re.compile(r'https?://').match(str(s))
|
||||
|
||||
|
||||
def converter(type_, default, s, allow_none=False, cond=None, allow_list=True):
    """
    Validate a property value, falling back to a default for invalid values.

    :param type_: The type valid values must be an instance of (`bool`s don't count as `int`).
    :param default: Returned - after issuing a warning - if `s` is invalid.
    :param s: The value to validate.
    :param allow_none: If `True`, `None` is passed through as valid value.
    :param cond: Optional callable; valid values must make it return a truthy result.
    :param allow_list: If `True` (and `type_` isn't `list`), a `list` is validated item by \
    item, dropping items which convert to `None`.
    :return: `s`, `default` or a `list` of valid items.
    """
    if allow_list and type_ != list and isinstance(s, list):
        converted = (converter(type_, None, item, cond=cond) for item in s)
        return [v for v in converted if v is not None]

    if allow_none and s is None:
        return None

    valid = (
        isinstance(s, type_)
        and not (type_ == int and isinstance(s, bool))
        and (not cond or cond(s)))
    if valid:
        return s
    warnings.warn('Invalid value for property: {}'.format(s))
    return default
|
||||
|
||||
|
||||
def ensure_path(fname):
    """
    Make sure a file name is available as `pathlib.Path`.

    :param fname: `pathlib.Path` (returned unchanged) or `str`.
    :return: `pathlib.Path`
    """
    if isinstance(fname, pathlib.Path):
        return fname
    assert isinstance(fname, str)
    return pathlib.Path(fname)
|
||||
|
||||
|
||||
def attr_defaults(cls):
    """
    Collect the default values of all fields of an `attrs` class.

    Defaults specified as `attr.Factory` are resolved by calling the factory.

    :param cls: A class with `attrs` fields.
    :return: `collections.OrderedDict` mapping field names to default values.
    """
    def resolved(default):
        return default.factory() if isinstance(default, attr.Factory) else default

    return collections.OrderedDict(
        (field.name, resolved(field.default)) for field in attr.fields(cls))
|
||||
|
||||
|
||||
def attr_asdict(obj, omit_defaults=True, omit_private=True):
    """
    Convert an instance of an `attrs` class to an ordered `dict`.

    :param obj: The instance to convert.
    :param omit_defaults: If `True`, fields with a value equal to the field's default are \
    left out.
    :param omit_private: If `True`, fields with names starting with `_` are left out.
    :return: `collections.OrderedDict` mapping field names to values.
    """
    cls = obj.__class__
    defaults = attr_defaults(cls)
    res = collections.OrderedDict()
    for field in attr.fields(cls):
        name = field.name
        if omit_private and name.startswith('_'):
            continue
        value = getattr(obj, name)
        if omit_defaults and value == defaults[name]:
            continue
        if hasattr(value, 'asdict'):
            # Nested objects are always serialized without their defaults.
            value = value.asdict(omit_defaults=True)
        res[name] = value
    return res
|
||||
|
||||
|
||||
def normalize_name(s):
    """Turn a string into something usable as python attribute name.

    Used to derive field names for namedtuples from ASCII strings: Hyphens, dots and spaces
    are replaced with underscores, python keywords get an underscore appended and names
    not starting with an ASCII letter or underscore get an underscore prepended.

    >>> str(normalize_name('class'))
    'class_'
    >>> str(normalize_name('a-name'))
    'a_name'
    >>> str(normalize_name('a n\u00e4me'))
    'a_name'
    >>> str(normalize_name('Name'))
    'Name'
    >>> str(normalize_name(''))
    '_'
    >>> str(normalize_name('1'))
    '_1'
    """
    s = s.translate(str.maketrans('-. ', '___'))
    if s in keyword.kwlist:
        return s + '_'
    # Slugify the chunks between underscores, keeping case.
    s = '_'.join(slug(chunk, lowercase=False) for chunk in s.split('_')) or '_'
    first = s[0]
    if not (first == '_' or first in string.ascii_letters):
        s = '_' + s
    return s
|
||||
|
||||
|
||||
def slug(s, remove_whitespace=True, lowercase=True):
    """Condensed version of s, containing only ASCII alphanumeric characters.

    Diacritics are stripped, ASCII punctuation and all non-ASCII characters are removed.

    >>> str(slug('A B. \u00e4C'))
    'abac'

    :param remove_whitespace: If `True`, whitespace is removed, otherwise each run of \
    whitespace is replaced with a single space.
    :param lowercase: If `True`, the result is lowercased.
    """
    # Decompose characters, then drop the combining marks (category "Mn").
    decomposed = unicodedata.normalize('NFD', s)
    res = ''.join(filter(lambda c: unicodedata.category(c) != 'Mn', decomposed))
    if lowercase:
        res = res.lower()
    res = res.translate(str.maketrans('', '', string.punctuation))
    res = re.sub(r'\s+', '' if remove_whitespace else ' ', res)
    res = res.encode('ascii', 'ignore').decode('ascii')
    assert re.match('[ A-Za-z0-9]*$', res)
    return res
|
||||
|
||||
|
||||
def qname2url(qname):
    """
    Expand a qualified name with one of the known namespace prefixes to a full URL.

    Only the leading prefix is expanded; further occurrences of `<prefix>:` in the local part
    of the name are kept as they are.

    >>> qname2url('dc:title')
    'http://purl.org/dc/terms/title'

    :param qname: `str` of the form `<prefix>:<local name>`.
    :return: The URL as `str` or `None` if `qname` doesn't start with a known prefix.
    """
    namespaces = {
        'csvw': 'http://www.w3.org/ns/csvw#',
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
        'xsd': 'http://www.w3.org/2001/XMLSchema#',
        'dc': 'http://purl.org/dc/terms/',
        'dcat': 'http://www.w3.org/ns/dcat#',
        'prov': 'http://www.w3.org/ns/prov#',
    }
    for prefix, uri in namespaces.items():
        qualifier = prefix + ':'
        if qname.startswith(qualifier):
            # Slice off the prefix rather than str.replace, which would rewrite every
            # occurrence of the prefix in the name.
            return uri + qname[len(qualifier):]
    return None
|
||||
|
||||
|
||||
def metadata2markdown(tg, link_files=False) -> str:
    """
    Render the metadata of a dataset as markdown.

    The document consists of a title, the common properties of the table group and one section
    per table listing the table's common properties, CSV dialect and columns.

    :param tg: The table group object describing the dataset; must provide `common_props`, \
    `tables` and `_fname`.
    :param link_files: If True, links to data files will be added, assuming the markdown is \
    stored in the same directory as the metadata file.
    :return: `str` with markdown formatted text
    """
    def qname2link(qname, as_html=False):
        """
        Link a qualified name to the URL it expands to - if the prefix is known.
        """
        url = qname2url(qname)
        if url:
            if as_html:
                return '<a href="{}">{}</a>'.format(url, qname)
            return '[{}]({})'.format(qname, url)
        return qname

    def htmlify(obj, key=None):
        """
        For inclusion in tables we must use HTML for lists.
        """
        if isinstance(obj, list):
            return '<ol>{}</ol>'.format(
                ''.join('<li>{}</li>'.format(htmlify(item, key=key)) for item in obj))
        if isinstance(obj, dict):
            items = []
            for k, v in obj.items():
                items.append('<dt>{}</dt><dd>{}</dd>'.format(
                    qname2link(k, as_html=True), html.escape(str(v))))
            return '<dl>{}</dl>'.format(''.join(items))
        return str(obj)

    def properties(props):
        """
        Render common properties: description and image first, the rest as two-column table.
        """
        # Work on a copy, because rendered properties are popped; empty values are skipped.
        props = {k: v for k, v in copy.deepcopy(props).items() if v}
        res = []
        desc = props.pop('dc:description', None)
        if desc:
            res.append(desc + '\n')
        img = props.pop('https://schema.org/image', None)
        if img:
            if isinstance(img, str):  # pragma: no cover
                # A plain string is taken as the image URL; it must be stored under the key
                # which is looked up below.
                img = {'https://schema.org/contentUrl': img}
            res.append('![{}]({})\n'.format(
                img.get('https://schema.org/caption') or '',
                img.get('https://schema.org/contentUrl')))
        if props:
            res.append('property | value\n --- | ---')
            for k, v in props.items():
                res.append('{} | {}'.format(qname2link(k), htmlify(v, key=k)))
        return '\n'.join(res) + '\n'

    def colrow(col, fks, pk):
        """
        Render the table row (name, datatype, description) describing a column.
        """
        dt = '`{}`'.format(col.datatype.base if col.datatype else 'string')
        if col.datatype:
            if col.datatype.format:
                if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format):
                    # A format made up of |-separated words is an enumeration of valid values.
                    dt += '<br>Valid choices:<br>'
                    dt += ''.join(' `{}`'.format(w) for w in col.datatype.format.split('|'))
                elif col.datatype.base == 'string':
                    dt += '<br>Regex: `{}`'.format(col.datatype.format)
            if col.datatype.minimum:
                dt += '<br>≥ {}'.format(col.datatype.minimum)
            if col.datatype.maximum:
                dt += '<br>≤ {}'.format(col.datatype.maximum)
        if col.separator:
            dt = 'list of {} (separated by `{}`)'.format(dt, col.separator)
        desc = col.common_props.get('dc:description', '').replace('\n', ' ')

        if pk and col.name in pk:
            desc = (desc + '<br>') if desc else desc
            desc += 'Primary key'

        if col.name in fks:
            desc = (desc + '<br>') if desc else desc
            desc += 'References [{}::{}](#table-{})'.format(
                fks[col.name][1], fks[col.name][0], slug(fks[col.name][1]))

        return ' | '.join([
            '[{}]({})'.format(col.name, col.propertyUrl)
            if col.propertyUrl else '`{}`'.format(col.name),
            dt,
            desc,
        ])

    res = ['# {}\n'.format(tg.common_props.get('dc:title', 'Dataset'))]
    if tg._fname and link_files:
        res.append('> [!NOTE]\n> Described by [{0}]({0}).\n'.format(tg._fname.name))

    res.append(properties({k: v for k, v in tg.common_props.items() if k != 'dc:title'}))

    for table in tg.tables:
        # Only single-column foreign keys are listed in the column descriptions.
        fks = {
            fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string)
            for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1}
        header = '## <a name="table-{}"></a>Table '.format(slug(table.url.string))
        if link_files and tg._fname and tg._fname.parent.joinpath(table.url.string).exists():
            header += '[{0}]({0})\n'.format(table.url.string)
        else:  # pragma: no cover
            header += table.url.string
        res.append('\n' + header + '\n')
        res.append(properties(table.common_props))
        dialect = table.inherit('dialect')
        if dialect.asdict():
            res.append('\n**CSV dialect**: `{}`\n'.format(json.dumps(dialect.asdict())))
        res.append('\n### Columns\n')
        res.append('Name/Property | Datatype | Description')
        res.append(' --- | --- | --- ')
        for col in table.tableSchema.columns:
            res.append(colrow(col, fks, table.tableSchema.primaryKey))
    return '\n'.join(res)
|
||||
Reference in New Issue
Block a user