2025-12-01

2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,27 @@
+# csvw - https://w3c.github.io/csvw/primer/
+
+from .metadata import (
+    TableGroup, Table, Column, ForeignKey, Link, NaturalLanguage, Datatype, URITemplate, CSVW,
+    Dialect)
+
+from .dsv import (UnicodeWriter,
+    UnicodeReader, UnicodeReaderWithLineNumber, UnicodeDictReader, NamedTupleReader,
+    iterrows, rewrite)
+
+# Public API of the package: exactly the names imported above from `.metadata` and `.dsv`.
+__all__ = [
+    'TableGroup',
+    'Table', 'Column', 'ForeignKey',
+    'Link', 'NaturalLanguage',
+    'Datatype',
+    'URITemplate',
+    'Dialect', 'UnicodeWriter',
+    'UnicodeReader', 'UnicodeReaderWithLineNumber', 'UnicodeDictReader', 'NamedTupleReader',
+    'iterrows', 'rewrite',
+    'CSVW',
+]
+
+# Package metadata.
+__title__ = 'csvw'
+__version__ = '3.5.1'
+__author__ = 'Robert Forkel'
+__license__ = 'Apache 2.0, see LICENSE'
+__copyright__ = 'Copyright (c) 2024 Robert Forkel'
@@ -0,0 +1,164 @@
+import sys
+import json
+import shutil
+import pathlib
+import argparse
+import subprocess
+
+from colorama import init, Fore, Style
+
+from csvw import CSVW, TableGroup
+from csvw.db import Database
+from csvw.utils import metadata2markdown
+
+
+def parsed_args(desc, args, *argspecs):
+    """
+    Return the arguments a command should work with.
+
+    :param desc: Description shown in the command's `--help` output.
+    :param args: Pre-built arguments object; if not `None` it is returned unchanged.
+    :param argspecs: Pairs `(names, options)`, forwarded as \
+    `parser.add_argument(*names, **options)`.
+    :return: `args` if given, otherwise the namespace parsed from the command line.
+    """
+    if args is not None:
+        return args
+    cli = argparse.ArgumentParser(description=desc)  # pragma: no cover
+    for names, options in argspecs:  # pragma: no cover
+        cli.add_argument(*names, **options)
+    return cli.parse_args()  # pragma: no cover
+
+
+def exit(ret, test=False):
+    """
+    Terminate the process with status `ret`, or just hand `ret` back when testing.
+
+    :param ret: Exit status.
+    :param test: If truthy, return `ret` instead of calling `sys.exit`.
+    :return: `ret` (only when `test` is truthy).
+    """
+    if not test:
+        sys.exit(ret)  # pragma: no cover
+    return ret
+
+
+def csvwdescribe(args=None, test=False):
+    """
+    Describe one or more CSV files with basic CSVW metadata and print it as JSON.
+
+    The schema is inferred by running the external `frictionless describe --json` command; its
+    output is converted with `TableGroup.from_frictionless_datapackage`.
+
+    :param args: Pre-parsed arguments (attributes `delimiter` and `csv`) or `None` to parse \
+    the command line.
+    :param test: If `True`, return the exit status instead of exiting the process.
+    :raises ValueError: If the `frictionless` executable cannot be found.
+    """
+    frictionless = shutil.which('frictionless')
+    if not frictionless:  # pragma: no cover
+        raise ValueError('The frictionless command must be installed for this functionality!\n'
+                         'Run `pip install frictionless` and try again.')
+
+    args = parsed_args(
+        "Describe a (set of) CSV file(s) with basic CSVW metadata.",
+        args,
+        (['--delimiter'], dict(default=None)),
+        (['csv'], dict(nargs='+', help="CSV files to describe as CSVW TableGroup")),
+    )
+    fargs = ['describe', '--json']
+    if args.delimiter:
+        # Serialize with json.dumps rather than string interpolation, so that delimiters which
+        # need escaping in JSON (tab, double quote, backslash) still yield a valid dialect spec.
+        fargs.extend(['--dialect', json.dumps({'delimiter': args.delimiter})])
+    onefile = False
+    if len(args.csv) == 1 and '*' not in args.csv[0]:
+        onefile = True
+        # Make sure we infer a tabular-data schema even if the file suffix does not suggest a CSV
+        # file.
+        fargs.extend(['--format', 'csv'])
+    else:
+        fargs.extend(['--type', 'package'])
+
+    dp = json.loads(subprocess.check_output([frictionless] + fargs + args.csv))
+    if onefile:
+        # A single resource description is wrapped to look like a data package.
+        dp = dict(resources=[dp], profile='data-package')
+
+    tg = TableGroup.from_frictionless_datapackage(dp)
+    print(json.dumps(tg.asdict(), indent=4))
+    return exit(0, test=test)
+
+
+def csvwvalidate(args=None, test=False):
+    """
+    Validate CSVW data and print a colored `OK` or `FAIL`.
+
+    Exit status is 0 for valid data, 1 if validation found problems and 2 if a `ValueError`
+    was raised while loading or validating.
+
+    :param args: Pre-parsed arguments (attributes `url` and `verbose`) or `None` to parse \
+    the command line.
+    :param test: If `True`, return the exit status instead of exiting the process.
+    """
+    init()
+    args = parsed_args(
+        "Validate a (set of) CSV file(s) described by CSVW metadata.",
+        args,
+        (['url'], dict(help='URL or local path to CSV or JSON metadata file.')),
+        (['-v', '--verbose'], dict(action='store_true', default=False)),
+    )
+    status = 0
+    try:
+        dataset = CSVW(args.url, validate=True)
+        if not dataset.is_valid:
+            status = 1
+            print(Style.BRIGHT + Fore.RED + 'FAIL')
+            if args.verbose:
+                # In verbose mode, list the messages of the warnings collected by validation.
+                for warning in dataset.warnings:
+                    print(Style.DIM + str(warning.message))
+        else:
+            print(Style.BRIGHT + Fore.GREEN + 'OK')
+    except ValueError as err:
+        status = 2
+        print(Style.BRIGHT + Fore.RED + 'FAIL')
+        if args.verbose:
+            print(Style.DIM + Fore.BLUE + str(err))
+    return exit(status, test=test)
+
+
+def csvw2datasette(args=None, test=False):
+    """
+    Write a SQLite database and a datasette metadata file for a CSVW dataset.
+
+    Both files (`datasette.db` and `datasette-metadata.json`) are created in `args.outdir`.
+
+    :param args: Pre-parsed arguments (attributes `url` and `outdir`) or `None` to parse \
+    the command line.
+    :param test: If `True`, return the exit status instead of exiting the process.
+    """
+    args = parsed_args(
+        "Convert CSVW to data for datasette (https://datasette.io/).",
+        args,
+        (['url'], dict(help='URL or local path to CSV or JSON metadata file.')),
+        (['-o', '--outdir'], dict(type=pathlib.Path, default=pathlib.Path('.'))),
+    )
+    db_path = args.outdir / 'datasette.db'
+    md_path = args.outdir / 'datasette-metadata.json'
+    csvw = CSVW(args.url)
+    Database(csvw.tablegroup, fname=db_path).write_from_tg()
+    # Copy over the Dublin Core properties present in the dataset's common properties.
+    # FIXME: flesh out, see https://docs.datasette.io/en/stable/metadata.html
+    md = {
+        key: csvw.common_props['dc:{}'.format(key)]
+        for key in ['title', 'description', 'license']
+        if 'dc:{}'.format(key) in csvw.common_props}
+    md_path.write_text(json.dumps(md, indent=4))
+    print("""Run
+    datasette {} --metadata {}
+and open your browser at
+    http://localhost:8001/
+to browse the data.
+""".format(db_path, md_path))
+    return exit(0, test=test)
+
+
+def csvw2json(args=None, test=False):
+    """
+    Print the JSON conversion of a CSVW dataset.
+
+    :param args: Pre-parsed arguments (attribute `url`) or `None` to parse the command line.
+    :param test: If `True`, return the exit status instead of exiting the process.
+    """
+    args = parsed_args(
+        "Convert CSVW to JSON, see https://w3c.github.io/csvw/csv2json/",
+        args,
+        (['url'], dict(help='URL or local path to CSV or JSON metadata file.')),
+    )
+    converted = CSVW(args.url).to_json()
+    print(json.dumps(converted, indent=4))
+    return exit(0, test=test)
+
+
+def csvw2sqlite(args=None, test=False):  # pragma: no cover
+    """
+    Load the data of a CSVW TableGroup into a SQLite database file.
+
+    An existing file at `args.output` is overwritten.
+
+    :param args: Pre-parsed arguments (attributes `url` and `output`) or `None` to parse \
+    the command line.
+    :param test: If `True`, return the exit status instead of exiting the process.
+    """
+    args = parsed_args(
+        # The description used to be a copy of the csvw2json one.
+        "Convert CSVW to a SQLite database.",
+        args,
+        (
+            ['url'],
+            dict(help='URL or local path to CSVW metadata file describing a TableGroup.\n\n'
+                      'Note that not all valid CSVW datasets can be converted to SQLite. One '
+                      'limitation is that all tables which are referenced by foreign keys must '
+                      'have a primary key.')),
+        (
+            ['output'],
+            dict(help='Path for the generated SQLite database file.')),
+    )
+    tg = TableGroup.from_file(args.url)
+    db = Database(tg, args.output)
+    db.write_from_tg(_force=True)
+    return exit(0, test=test)
+
+
+def csvw2markdown(args=None, test=False):
+    """
+    Print the output of `metadata2markdown` for a CSVW TableGroup.
+
+    :param args: Pre-parsed arguments (attribute `url`) or `None` to parse the command line.
+    :param test: If `True`, return the exit status instead of exiting the process.
+    """
+    args = parsed_args(
+        # Description and help text used to be copies of the csvw2json/csvw2sqlite ones.
+        "Convert CSVW metadata to Markdown.",
+        args,
+        (
+            ['url'],
+            dict(help='URL or local path to CSVW metadata file describing a TableGroup.')),
+    )
+    tg = TableGroup.from_file(args.url)
+    print(metadata2markdown(tg, link_files=True))
+    return exit(0, test=test)
+
+
+# When the module is run as a script, it behaves like the csvw2json command.
+if __name__ == '__main__':  # pragma: no cover
+    csvw2json()
@@ -0,0 +1,581 @@
+"""
+SQLite as alternative storage backend for a TableGroup's data.
+
+For the most part, translation of a TableGroup's tableSchema to SQL works as expected:
+
+- each table is converted to a `CREATE TABLE` statement
+- each column specifies a column in the corresponding `CREATE TABLE` statement
+- `foreignKey` constraints are added according to the corresponding `tableSchema` property.
+
+List-valued foreignKeys are supported as follows: For each pair of tables related through a
+list-valued foreign key, an association table is created. To make it possible to distinguish
+multiple list-valued foreign keys between the same two tables, the association table has
+a column `context`, which stores the name of the foreign key column from which a row in the
+association table was created.
+
+Other list-valued columns work in two different ways: If the atomic datatype is `string`, the
+specified separator is used to create a concatenated string representation in the database field.
+Otherwise, the list of values is serialized as JSON.
+
+SQL table and column names can be customized by passing a translator callable when instantiating
+a :class:`Database`.
+
+SQLite support has the following limitations:
+
+- regex constraints on strings (as specified via a :class:`csvw.Datatype`'s format attribute) are
+  not enforced by the database.
+"""
+import json
+import typing
+import decimal
+import pathlib
+import sqlite3
+import functools
+import contextlib
+import collections
+
+import attr
+
+import csvw
+from csvw.datatypes import DATATYPES
+from csvw.metadata import TableGroup
+
+
+def identity(s):
+    """Return `s` unchanged; used where a value needs no conversion."""
+    return s
+
+
+# Maps CSVW datatype names to triples
+#     (SQLite column type, converter applied when writing, converter applied when reading).
+# Datatypes not listed here are stored as TEXT using the datatype's own string conversion
+# (see `ColSpec.__attrs_post_init__`).
+TYPE_MAP = {
+    'string': (
+        'TEXT',
+        identity,
+        identity),
+    'integer': (
+        'INTEGER',
+        identity,
+        identity),
+    # Booleans are stored as integers; `None` is passed through in both directions.
+    'boolean': (
+        'INTEGER',
+        lambda s: s if s is None else int(s),
+        lambda s: s if s is None else bool(s)),
+    # NOTE(review): decimals are converted to `float` on write, so values presumably cannot
+    # round-trip with more precision than a float provides - confirm this is acceptable.
+    'decimal': (
+        'REAL',
+        lambda s: s if s is None else float(s),
+        lambda s: s if s is None else decimal.Decimal(s)),
+    'hexBinary': (
+        'BLOB',
+        identity,
+        identity),
+}
+
+
+class SchemaTranslator(typing.Protocol):
+    """
+    Callable translating CSVW names into names of DB schema objects.
+
+    Called with only `table` it returns the table name, called with `table` and `column` it
+    returns the column name (see `Database.name_translator` for the default implementation).
+    """
+    def __call__(self, table: str, column: typing.Optional[str] = None) -> str:
+        ...  # pragma: no cover
+
+
+class ColumnTranslator(typing.Protocol):
+    """
+    Callable translating a CSVW column name into a DB column name.
+
+    `TableSpec.sql` creates such callables by binding the table name of a `SchemaTranslator`.
+    """
+    def __call__(self, column: str) -> str:
+        ...  # pragma: no cover
+
+
+def quoted(*names):
+    """
+    Format names as comma-separated list of backtick-quoted SQL identifiers.
+
+    Backticks inside a name are doubled, so that such names cannot terminate the quoted
+    identifier early and produce malformed SQL.
+
+    :param names: Schema object names.
+    :return: String like "`a`,`b`"; the empty string if no names are passed.
+    """
+    return ','.join('`{0}`'.format('{0}'.format(name).replace('`', '``')) for name in names)
+
+
+def insert(db: sqlite3.Connection,
+           translate: SchemaTranslator,
+           table: str,
+           keys: typing.Sequence[str],
+           *rows: list,
+           single: typing.Optional[bool] = False):
+    """
+    Insert a sequence of rows into a table.
+
+    If the bulk insert fails, the rows are inserted again one at a time, so that the offending
+    row can be printed together with the SQL statement before the error is re-raised.
+
+    :param db: Database connection.
+    :param translate: Callable translating table and column names to proper schema object names.
+    :param table: Untranslated table name.
+    :param keys: Untranslated column names.
+    :param rows: Sequence of rows to insert.
+    :param single: Flag signaling that this is the row-by-row retry after a failed bulk \
+    insert; in this mode SQL and row are printed and the error is re-raised.
+    """
+    if rows:
+        sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(
+            quoted(translate(table)),
+            quoted(*[translate(table, k) for k in keys]),
+            ','.join(['?' for _ in keys]))
+        try:
+            db.executemany(sql, rows)
+        except Exception:
+            # This is purely for debugging. We catch `Exception` rather than using a bare
+            # `except`, so that KeyboardInterrupt and SystemExit are not turned into a retry.
+            if not single:
+                for row in rows:
+                    insert(db, translate, table, keys, row, single=True)
+            else:
+                print(sql)
+                print(rows)
+                raise
+
+
+def select(db: sqlite3.Connection, table: str) -> typing.Tuple[typing.List[str], typing.Sequence]:
+    """
+    Fetch the complete content of a table.
+
+    :param db: Database connection.
+    :param table: Name of the table as used in the database.
+    :return: Pair (column names, list of rows).
+    """
+    cursor = db.execute("SELECT * FROM {0}".format(quoted(table)))
+    # The first item of each entry in `cursor.description` is the column name.
+    header = [description[0] for description in cursor.description]
+    records = cursor.fetchall()
+    return header, list(records)
+
+
+@attr.s
+class ColSpec:
+    """
+    A `ColSpec` captures sufficient information about a :class:`csvw.Column` for the DB schema.
+
+    `db_type`, `convert` and `read` are derived from `csvw_type` after initialisation; values
+    passed for these attributes are overwritten.
+    """
+    name = attr.ib()
+    # Name of the CSVW datatype; falsy values fall back to 'string'.
+    csvw_type = attr.ib(default='string', converter=lambda s: s if s else 'string')
+    separator = attr.ib(default=None)
+    db_type = attr.ib(default=None)
+    convert = attr.ib(default=None)
+    read = attr.ib(default=None)
+    required = attr.ib(default=False)
+    # Datatype description providing the constraints (minimum, maximum, length, ...).
+    csvw = attr.ib(default=None)
+
+    def __attrs_post_init__(self):
+        if self.csvw_type in TYPE_MAP:
+            self.db_type, self.convert, self.read = TYPE_MAP[self.csvw_type]
+        else:
+            # Datatypes without dedicated SQLite type are stored in their string representation.
+            self.db_type = 'TEXT'
+            self.convert = DATATYPES[self.csvw_type].to_string
+            self.read = DATATYPES[self.csvw_type].to_python
+        if self.separator and self.db_type != 'TEXT':
+            # List-valued columns are always stored as (serialized) text.
+            self.db_type = 'TEXT'
+
+    def check(self, translate: ColumnTranslator) -> typing.Optional[str]:
+        """
+        We try to convert as many data constraints as possible into SQLite CHECK constraints.
+
+        Range constraints (minimum/maximum) take precedence; length constraints are only
+        considered if no range constraint is specified.
+
+        :param translate: Callable to translate column names between CSVW metadata and DB schema.
+        :return: A string suitable as argument of an SQL CHECK constraint (possibly empty), or \
+        `None` if no datatype description is available.
+        """
+        if not self.csvw:
+            return
+        c, cname = self.csvw, translate(self.name)
+        constraints = []
+        if (c.minimum is not None) or (c.maximum is not None):
+            # Dates and datetimes are compared using the SQLite function of the same name.
+            func = {
+                'date': 'date',
+                'datetime': 'datetime',
+            }.get(self.csvw_type)
+            if c.minimum is not None:
+                if func:
+                    constraints.append("{2}(`{0}`) >= {2}('{1}')".format(cname, c.minimum, func))
+                else:
+                    constraints.append('`{0}` >= {1}'.format(cname, c.minimum))
+            if c.maximum is not None:
+                if func:
+                    constraints.append("{2}(`{0}`) <= {2}('{1}')".format(cname, c.maximum, func))
+                else:
+                    constraints.append('`{0}` <= {1}'.format(cname, c.maximum))
+        elif any(cc is not None for cc in [c.length, c.minLength, c.maxLength]):
+            # Compare with None rather than testing truthiness: a length of 0 is a valid
+            # constraint and must not be dropped.
+            if c.length is not None:
+                constraints.append('length(`{0}`) = {1}'.format(cname, c.length))
+            if c.minLength is not None:
+                constraints.append('length(`{0}`) >= {1}'.format(cname, c.minLength))
+            if c.maxLength is not None:
+                constraints.append('length(`{0}`) <= {1}'.format(cname, c.maxLength))
+        return ' AND '.join(constraints)
+
+    def sql(self, translate: ColumnTranslator) -> str:
+        """
+        :param translate: Callable to translate column names between CSVW metadata and DB schema.
+        :return: The column definition for use in a `CREATE TABLE` statement.
+        """
+        _check = self.check(translate)
+        return '`{0}` {1}{2}{3}'.format(
+            translate(self.name),
+            self.db_type,
+            ' NOT NULL' if self.required else '',
+            ' CHECK ({0})'.format(_check) if _check else '')
+
+
+@attr.s
+class TableSpec(object):
+    """
+    A `TableSpec` captures sufficient information about a :class:`csvw.Table` for the DB schema.
+
+    .. note::
+
+        We support "light-weight" many-to-many relationships by allowing list-valued foreign key
+        columns in CSVW. In the database these columns are turned into an associative table,
+        adding the name of the column as value of a `context` column. Thus, multiple columns in
+        a table may be specified as targets of many-to-many relations with the same table.
+
+        .. seealso:: `<https://en.wikipedia.org/wiki/Associative_entity>`_
+    """
+    name = attr.ib()
+    columns = attr.ib(default=attr.Factory(list))
+    # Triples (local column names, referenced table name, referenced column names).
+    foreign_keys = attr.ib(default=attr.Factory(list))
+    # Maps names of list-valued foreign key columns to the `TableSpec` of the association table.
+    many_to_many = attr.ib(default=attr.Factory(collections.OrderedDict))
+    primary_key = attr.ib(default=None)
+
+    @classmethod
+    def from_table_metadata(cls,
+                            table: csvw.Table,
+                            drop_self_referential_fks: typing.Optional[bool] = True) -> 'TableSpec':
+        """
+        Create a `TableSpec` from the schema description of a `csvw.metadata.Table`.
+
+        :param table: `csvw.metadata.Table` instance.
+        :param drop_self_referential_fks: Flag signaling whether to drop self-referential foreign \
+        keys. This may be necessary, if the order of rows in a CSVW table does not guarantee \
+        referential integrity when inserted in order (e.g. an earlier row referring to a later one).
+        :return: `TableSpec` instance.
+        """
+        table_schema = table.tableSchema
+        spec = cls(name=table.local_name, primary_key=table_schema.primaryKey)
+        list_valued = {c.header for c in table_schema.columns if c.separator}
+        for fk in table_schema.foreignKeys:
+            # We only support Foreign Key references between tables!
+            if fk.reference.schemaReference:
+                continue
+            target = fk.reference.resource.string
+            if len(fk.columnReference) == 1 and fk.columnReference[0] in list_valued:
+                # List-valued foreign keys are turned into a many-to-many relation!
+                assert len(fk.reference.columnReference) == 1, \
+                    'Composite key {0} in table {1} referenced'.format(
+                        fk.reference.columnReference,
+                        fk.reference.resource)
+                assert spec.primary_key and len(spec.primary_key) == 1, \
+                    'Table {0} referenced by list-valued foreign key must have non-composite ' \
+                    'primary key'.format(spec.name)
+                spec.many_to_many[fk.columnReference[0]] = TableSpec.association_table(
+                    spec.name,
+                    spec.primary_key[0],
+                    target,
+                    fk.reference.columnReference[0],
+                )
+            elif not drop_self_referential_fks or target != spec.name:
+                spec.foreign_keys.append((
+                    sorted(fk.columnReference),
+                    target,
+                    sorted(fk.reference.columnReference),
+                ))
+        for c in table_schema.columns:
+            if c.header in spec.many_to_many:
+                # Columns represented by an association table do not become DB columns.
+                continue
+            datatype = c.inherit('datatype')
+            spec.columns.append(ColSpec(
+                name=c.header,
+                csvw_type=datatype.base if datatype else datatype,
+                separator=c.inherit('separator'),
+                required=c.inherit('required'),
+                csvw=c.inherit('datatype'),
+            ))
+        return spec
+
+    @classmethod
+    def association_table(cls, atable, apk, btable, bpk) -> 'TableSpec':
+        """
+        List-valued foreignKeys are supported as follows: For each pair of tables related through a
+        list-valued foreign key, an association table is created. To make it possible to distinguish
+        multiple list-valued foreign keys between the same two tables, the association table has
+        a column `context`, which stores the name of the foreign key column from which a row in the
+        association table was created.
+
+        :param atable: Name of the table containing the list-valued foreign key.
+        :param apk: Name of the primary key column of `atable`.
+        :param btable: Name of the referenced table.
+        :param bpk: Name of the referenced column of `btable`.
+        :return: `TableSpec` for the association table.
+        """
+        source_col = ColSpec('{0}_{1}'.format(atable, apk))
+        target_col = ColSpec('{0}_{1}'.format(btable, bpk))
+        if source_col.name == target_col.name:
+            # Self-referential relation: disambiguate the two foreign key columns.
+            source_col.name += '_1'
+            target_col.name += '_2'
+        references = [
+            ([source_col.name], atable, [apk]),
+            ([target_col.name], btable, [bpk]),
+        ]
+        return cls(
+            name='{0}_{1}'.format(atable, btable),
+            columns=[source_col, target_col, ColSpec('context')],
+            foreign_keys=references,
+        )
+
+    def sql(self, translate: SchemaTranslator) -> str:
+        """
+        :param translate: Callable translating table and column names to schema object names.
+        :return: The SQL statement to create the table.
+        """
+        col_translate = functools.partial(translate, self.name)
+        clauses = [col.sql(col_translate) for col in self.columns]
+        if self.primary_key:
+            pk_columns = [col_translate(c) for c in self.primary_key]
+            clauses.append('PRIMARY KEY({0})'.format(quoted(*pk_columns)))
+        for local_cols, ref_table, ref_cols in self.foreign_keys:
+            clauses.append('FOREIGN KEY({0}) REFERENCES {1}({2}) ON DELETE CASCADE'.format(
+                quoted(*[col_translate(c) for c in local_cols]),
+                quoted(translate(ref_table)),
+                quoted(*[translate(ref_table, c) for c in ref_cols])))
+        body = ',\n    '.join(clauses)
+        return "CREATE TABLE IF NOT EXISTS `{0}` (\n    {1}\n)".format(translate(self.name), body)
+
+
+def schema(tg: csvw.TableGroup,
+           drop_self_referential_fks: typing.Optional[bool] = True) -> typing.List[TableSpec]:
+    """
+    Convert the table and column descriptions of a `TableGroup` into specifications for the
+    DB schema.
+
+    :param tg: CSVW TableGroup.
+    :param drop_self_referential_fks: Flag signaling whether to drop self-referential foreign \
+    keys. This may be necessary, if the order of rows in a CSVW table does not guarantee \
+    referential integrity when inserted in order (e.g. an earlier row referring to a later one).
+    :return: List of `TableSpec` objects (including association tables), ordered such that \
+    each table comes after all tables it references.
+    :raises ValueError: If the tables cannot be ordered, i.e. a table references a table that \
+    can never be created before it.
+    """
+    tables = {}
+    for table in tg.tabledict.values():
+        t = TableSpec.from_table_metadata(
+            table, drop_self_referential_fks=drop_self_referential_fks)
+        tables[t.name] = t
+        for at in t.many_to_many.values():
+            tables[at.name] = at
+
+    # We must determine the order in which tables must be created!
+    ordered = collections.OrderedDict()
+
+    # We loop through the tables repeatedly, and whenever we find one, which has all
+    # referenced tables already in ordered, we move it from tables to ordered.
+    # Termination is detected by lack of progress rather than by a fixed number of passes,
+    # since each pass moves just one table and schemas may have arbitrarily many tables.
+    while tables:
+        for name in list(tables):
+            if all(ref[1] in ordered or ref[1] == name for ref in tables[name].foreign_keys):
+                # All referenced tables are already created (or self-referential).
+                ordered[name] = tables.pop(name)
+                break
+        else:  # pragma: no cover
+            # No remaining table has all its references satisfied.
+            raise ValueError('there seem to be cyclic dependencies between the tables')
+
+    return list(ordered.values())
+
+
+class Database(object):
+    """
+    Represents a SQLite database associated with a :class:`csvw.TableGroup` instance.
+
+    :param tg: `TableGroup` instance defining the schema of the database.
+    :param fname: Path to which to write the database file. If not specified, an in-memory \
+    database is used.
+    :param translate: Schema object name translator.
+    :param drop_self_referential_fks: Flag signaling whether to drop or enforce self-referential \
+    foreign-key constraints.
+
+    .. warning::
+
+        We write rows of a table to the database sequentially. Since CSVW does not require ordering
+        rows in tables such that self-referential foreign-key constraints are satisfied at each row,
+        we don't enforce self-referential foreign-keys by default in order to not trigger "false"
+        integrity errors. If data in a CSVW Table is known to be ordered appropriately, `False`
+        should be passed as `drop_self_referential_fks` keyword parameter to enforce
+        self-referential foreign-keys.
+    """
+    def __init__(
+            self,
+            tg: TableGroup,
+            fname: typing.Optional[typing.Union[pathlib.Path, str]] = None,
+            translate: typing.Optional[SchemaTranslator] = None,
+            drop_self_referential_fks: typing.Optional[bool] = True,
+    ):
+        self.translate = translate or Database.name_translator
+        self.fname = pathlib.Path(fname) if fname else None
+        self.init_schema(tg, drop_self_referential_fks=drop_self_referential_fks)
+        self._connection = None  # For in-memory dbs we need to keep the connection!
+
+    def init_schema(self, tg, drop_self_referential_fks=True):
+        """
+        Compute the table specifications for `tg`; without a table group the schema is empty.
+        """
+        self.tg = tg
+        self.tables = schema(
+            self.tg, drop_self_referential_fks=drop_self_referential_fks) if self.tg else []
+
+    @property
+    def tdict(self) -> typing.Dict[str, TableSpec]:
+        """
+        The table specifications, keyed by (untranslated) table name.
+        """
+        return {t.name: t for t in self.tables}
+
+    @staticmethod
+    def name_translator(table: str, column: typing.Optional[str] = None) -> str:
+        """
+        A callable with this signature can be passed into DB creation to control the names
+        of the schema objects.
+
+        :param table: CSVW name of the table before translation
+        :param column: CSVW name of a column of `table` before translation
+        :return: Translated table name if `column is None` else translated column name
+        """
+        # By default, no translation is done:
+        return column or table
+
+    def connection(self) -> typing.Union[sqlite3.Connection, contextlib.closing]:
+        """
+        :return: An object to be used as context manager providing the DB connection: For \
+        file-based dbs a new connection which is closed on exit, for in-memory dbs the one \
+        connection kept on the instance.
+        """
+        if self.fname:
+            return contextlib.closing(sqlite3.connect(str(self.fname)))
+        if not self._connection:
+            self._connection = sqlite3.connect(':memory:')
+        return self._connection
+
+    def select_many_to_many(self, db, table, context) -> dict:
+        """
+        Read the content of an association table.
+
+        :param db: Database connection.
+        :param table: `TableSpec` of the association table.
+        :param context: Value of the `context` column to restrict the result to, or `None` \
+        to read all rows.
+        :return: `dict` mapping values of the first foreign key column to lists of the related \
+        values of the second one - or to lists of pairs (value, context) if `context` is `None`.
+        """
+        # The context is passed as bound parameter rather than formatted into the SQL, so that
+        # values containing quotes cannot break (or alter) the statement.
+        params = ()
+        context_sql = ''
+        if context is not None:
+            context_sql = 'WHERE context = ?'
+            params = (context,)
+        sql = """\
+SELECT {0}, group_concat({1}, ' '), group_concat(COALESCE(context, ''), '||')
+FROM {2} {3} GROUP BY {0}""".format(
+                quoted(self.translate(table.name, table.columns[0].name)),
+                quoted(self.translate(table.name, table.columns[1].name)),
+                quoted(self.translate(table.name)),
+                context_sql)
+        cu = db.execute(sql, params)
+        # Note: Related keys are concatenated with (and split on) whitespace.
+        return {
+            r[0]: [(k, v) if context is None else k
+                   for k, v in zip(r[1].split(), r[2].split('||'))] for r in cu.fetchall()}
+
+    def separator(self, tname: str, cname: str) -> typing.Optional[str]:
+        """
+        :return: separator for the column specified by db schema names `tname` and `cname`.
+        """
+        for name in self.tdict:
+            if self.translate(name) == tname:
+                for col in self.tdict[name].columns:
+                    if self.translate(name, col.name) == cname:
+                        return col.separator
+
+    def split_value(self, tname, cname, value) -> typing.Union[typing.List[str], str, None]:
+        """
+        :return: `value` split on the separator of the column specified by db schema names \
+        `tname` and `cname`, or `value` unchanged if the column has no separator.
+        """
+        sep = self.separator(tname, cname)
+        return (value or '').split(sep) if sep else value
+
+    def read(self) -> typing.Dict[str, typing.List[typing.OrderedDict]]:
+        """
+        :return: A `dict` where keys are SQL table names corresponding to CSVW tables and values \
+        are lists of rows, represented as dicts where keys are the SQL column names.
+        """
+        res = collections.defaultdict(list)
+        with self.connection() as conn:
+            for tname in self.tg.tabledict:
+                #
+                # FIXME: how much do we want to use DB types? Probably as much as possible!
+                # Thus we need to convert on write **and** read!
+                #
+                convert, seps, refs = {}, {}, collections.defaultdict(dict)
+                table = self.tdict[tname]  # The TableSpec object.
+
+                # Assemble the conversion dictionary:
+                for col in table.columns:
+                    convert[self.translate(tname, col.name)] = [col.name, identity]
+                    if col.csvw_type in TYPE_MAP:
+                        convert[self.translate(tname, col.name)][1] = TYPE_MAP[col.csvw_type][2]
+                    else:
+                        convert[self.translate(tname, col.name)][1] = \
+                            DATATYPES[col.csvw_type].to_python
+                    if col.separator:
+                        if col.csvw_type == 'string':
+                            seps[self.translate(tname, col.name)] = col.separator
+                        else:
+                            seps[self.translate(tname, col.name)] = 'json'
+
+                # Retrieve the many-to-many relations:
+                for col, at in table.many_to_many.items():
+                    for pk, v in self.select_many_to_many(conn, at, col).items():
+                        refs[pk][self.translate(tname, col)] = v
+
+                cols, rows = select(conn, self.translate(tname))
+                for row in rows:
+                    d = collections.OrderedDict()
+                    for k, v in zip(cols, row):
+                        if k in seps:
+                            if v is None:
+                                d[k] = None
+                            elif not v:
+                                d[k] = []
+                            elif seps[k] == 'json':
+                                d[k] = json.loads(v)
+                            else:
+                                d[k] = [convert[k][1](v_) for v_ in (v or '').split(seps[k])]
+                        else:
+                            d[k] = convert[k][1](v) if v is not None else None
+                    pk = d[self.translate(tname, table.primary_key[0])] \
+                        if table.primary_key and len(table.primary_key) == 1 else None
+                    # Defaults for the many-to-many columns must use the same (translated) keys
+                    # as the values collected in refs - otherwise a custom translator would
+                    # result in two keys for the same column.
+                    d.update({self.translate(tname, k): [] for k in table.many_to_many})
+                    d.update(refs.get(pk, {}))
+                    res[self.translate(tname)].append(d)
+        return res
+
+    def association_table_context(self, table, column, fkey):
+        """
+        Context for association tables is created calling this method.
+
+        Note: If a custom value for the `context` column is created by overwriting this method,
+        `select_many_to_many` must be adapted accordingly, to make sure the custom
+        context is retrieved when reading the data from the db.
+
+        :param table: `TableSpec` of the table containing the list-valued foreign key.
+        :param column: Name of the list-valued foreign key column.
+        :param fkey: One item of the list of foreign keys.
+        :return: a pair (foreign key, context)
+        """
+        # The default implementation takes the column name as context:
+        return fkey, column
+
+    def write_from_tg(self, _force=False, _exists_ok=False, _skip_extra=False):
+        """
+        Write the data read from the associated `TableGroup` to the database, see `write`.
+        """
+        return self.write(
+            force=_force,
+            _exists_ok=_exists_ok,
+            _skip_extra=_skip_extra,
+            **self.tg.read())
+
+    def write(self, *, force=False, _exists_ok=False, _skip_extra=False, **items):
+        """
+        Creates a db file with the core schema.
+
+        :param force: If `True` an existing db file will be overwritten.
+        :param _exists_ok: Accepted, but not used by this method.
+        :param _skip_extra: If `True`, row items for which no column is specified are \
+        ignored rather than raising `ValueError`.
+        :param items: Rows to insert as lists of `dict` objects, keyed by (untranslated) \
+        table name.
+        :raises ValueError: If the db file exists and `force` is not set, or if data for an \
+        unspecified column is encountered and `_skip_extra` is not set.
+        """
+        if self.fname and self.fname.exists():
+            if not force:
+                raise ValueError('db file already exists, use force=True to overwrite')
+            else:
+                self.fname.unlink()
+
+        with self.connection() as db:
+            for table in self.tables:
+                db.execute(table.sql(translate=self.translate))
+
+            db.execute('PRAGMA foreign_keys = ON;')
+            db.commit()
+
+            refs = collections.defaultdict(list)  # collects rows in association tables.
+            for t in self.tables:
+                if t.name not in items:
+                    continue
+                rows, keys = [], []
+                cols = {c.name: c for c in t.columns}
+                for i, row in enumerate(items[t.name]):
+                    pk = row[t.primary_key[0]] \
+                        if t.primary_key and len(t.primary_key) == 1 else None
+                    values = []
+                    for k, v in row.items():
+                        if k in t.many_to_many:
+                            assert pk
+                            at = t.many_to_many[k]
+                            # Association table name followed by its column names.
+                            atkey = tuple([at.name] + [c.name for c in at.columns])
+                            # We distinguish None - meaning NULL - and [] - meaning no items - as
+                            # values of list-valued columns.
+                            for vv in (v or []):
+                                fkey, context = self.association_table_context(t, k, vv)
+                                refs[atkey].append((pk, fkey, context))
+                        else:
+                            if k not in cols:
+                                if _skip_extra:
+                                    continue
+                                else:
+                                    raise ValueError(
+                                        'unspecified column {0} found in data'.format(k))
+                            col = cols[k]
+                            if isinstance(v, list):
+                                # Lists of strings are joined using the separator, lists of
+                                # other datatypes are serialized as JSON.
+                                if col.csvw_type == 'string':
+                                    v = (col.separator or ';').join(
+                                        col.convert(vv) or '' for vv in v)
+                                else:
+                                    v = json.dumps(v)
+                            else:
+                                v = col.convert(v) if v is not None else None
+                            # The column names for the INSERT are taken from the first row.
+                            if i == 0:
+                                keys.append(col.name)
+                            values.append(v)
+                    rows.append(tuple(values))
+                insert(db, self.translate, t.name, keys, *rows)
+
+            # Association tables are filled after all tables they reference.
+            for atkey, rows in refs.items():
+                insert(db, self.translate, atkey[0], atkey[1:], *rows)
+
+            db.commit()
@@ -0,0 +1,441 @@
+"""Support for reading delimiter-separated value files.
+
+This module contains unicode aware replacements for :func:`csv.reader`
+and :func:`csv.writer`. It was stolen/extracted from the ``csvkit``
+project to allow re-use when the whole ``csvkit`` package isn't
+required.
+
+The original implementations were largely copied from
+`examples in the csv module documentation <http://docs.python.org/library/csv.html\
+#examples>`_.
+
+.. seealso:: http://en.wikipedia.org/wiki/Delimiter-separated_values
+"""
+import io
+import csv
+import codecs
+import shutil
+import typing
+import pathlib
+import tempfile
+import warnings
+import functools
+import collections
+
+from . import utils
+from .dsv_dialects import Dialect
+
+__all__ = [
+    'UnicodeWriter',
+    'UnicodeReader', 'UnicodeReaderWithLineNumber', 'UnicodeDictReader', 'NamedTupleReader',
+    'iterrows',
+    'rewrite', 'add_rows', 'filter_rows_as_dict',
+]
+
+LINES_OR_PATH = typing.Union[str, pathlib.Path, typing.IO, typing.Iterable[str]]
+
+
+def normalize_encoding(encoding: str) -> str:
+    """
+    Return the canonical name under which Python's codec registry knows `encoding`.
+
+    :param encoding: Any encoding name or alias accepted by :func:`codecs.lookup`.
+    :return: The `name` of the matching `codecs.CodecInfo`, e.g. `'utf-8'` for `'UTF8'`.
+    :raises LookupError: If no codec is registered for `encoding`.
+    """
+    codec_info = codecs.lookup(encoding)
+    return codec_info.name
+
+
+class UnicodeWriter:
+    """
+    Writer for delimiter-separated values which accepts Unicode text.
+
+    :param f: Where the data goes: a local path given as `str` or `pathlib.Path`, or `None` \
+    to collect the DSV-formatted data in memory for retrieval via :meth:`~UnicodeWriter.read`. \
+    Any other object is handed to `csv.writer` unchanged.
+    :param dialect: Name of a dialect known to `csv.writer`, or a :class:`~Dialect` instance \
+    when more customization is needed than `csv.writer` dialects offer. The encoding of a \
+    :class:`~Dialect` instance takes precedence over an `encoding` keyword argument.
+    :param kw: Additional keyword arguments for `csv.writer`; `encoding` (default `'utf-8'`) \
+    is consumed by the writer itself.
+
+    .. code-block:: python
+
+        >>> from csvw import UnicodeWriter
+        >>> with UnicodeWriter('data.tsv', delimiter='\t') as writer:
+        ...     writer.writerow(['ä', 'ö', 'ü'])
+    """
+
+    def __init__(
+            self,
+            f: typing.Optional[typing.Union[str, pathlib.Path]] = None,
+            dialect: typing.Optional[typing.Union[Dialect, str]] = None,
+            **kw):
+        self.f = f
+        encoding = kw.pop('encoding', 'utf-8')
+        if isinstance(dialect, Dialect):
+            # A full dialect description dictates the encoding; explicit keyword arguments
+            # still override the formatting parameters derived from it.
+            encoding = dialect.python_encoding
+            self.kw = dialect.as_python_formatting_parameters()
+            self.kw.update(kw)
+        else:
+            self.kw = kw
+            if dialect:
+                self.kw['dialect'] = dialect
+        self.encoding = normalize_encoding(encoding)
+        self.escapechar = self.kw.get('escapechar')
+
+        if self.escapechar and self.kw.get('quoting') != csv.QUOTE_NONE:
+            # work around https://bugs.python.org/issue12178
+            # (csv.writer doesn't escape escapechar while csv.reader expects it)
+            single, doubled = self.escapechar, 2 * self.escapechar
+
+            def _escapedoubled(row):
+                return [
+                    cell.replace(single, doubled) if isinstance(cell, str) else cell
+                    for cell in row]
+        else:
+            def _escapedoubled(row):
+                return row
+        self._escapedoubled = _escapedoubled
+        # Only file handles opened by the writer itself are closed on exit.
+        self._close = False
+
+    def __enter__(self):
+        target = self.f
+        if target is None:
+            # In-memory target; the content can be fetched with `read`.
+            self.f = io.StringIO(newline='')
+        elif isinstance(target, (str, pathlib.Path)):
+            path = str(target) if isinstance(target, pathlib.Path) else target
+            self.f = path
+            self.f = io.open(path, 'wt', encoding=self.encoding, newline='')
+            self._close = True
+
+        self.writer = csv.writer(self.f, **self.kw)
+        return self
+
+    def read(self) -> typing.Optional[bytes]:
+        """
+        Return the CSV data written so far, encoded as UTF-8 `bytes`.
+
+        This is meant for writers initialized with `None` as target. The target is rewound
+        if it has a `seek` method; `None` is returned if it has no `read` method.
+        """
+        stream = self.f
+        if hasattr(stream, 'seek'):
+            stream.seek(0)
+        if not hasattr(stream, 'read'):
+            return None
+        return stream.read().encode('utf-8')
+
+    def __exit__(self, type, value, traceback):
+        if not self._close:
+            return
+        self.f.close()
+
+    def writerow(self, row: typing.Union[tuple, list]):
+        """Write a single row, doubling escape characters in `str` cells where required."""
+        escaped = self._escapedoubled(row)
+        self.writer.writerow(escaped)
+
+    def writerows(self, rows: typing.Iterable[typing.Union[tuple, list]]):
+        """Write each row of `rows` via :meth:`writerow`."""
+        for row in rows:
+            self.writerow(row)
+
+
+class UnicodeReader:
+    """
+    Read Unicode data from a csv file.
+
+    :param f: The source from which to read the data; a local path specified as `str` or \
+    `pathlib.Path`, a file-like object or a `list` of lines.
+    :param dialect: Either a dialect name as recognized by `csv.reader` or a \
+    :class:`~Dialect` instance for dialect customization beyond what can be done with \
+    `csv.reader`. The encoding of a :class:`~Dialect` instance takes precedence over an \
+    `encoding` keyword argument.
+    :param kw: Keyword arguments passed through to `csv.reader`; `encoding` (default \
+    `'utf-8-sig'`) and `lineterminator` are consumed by the reader itself.
+
+    With a :class:`~Dialect`, rows skipped as comments or due to `skipRows` are collected as \
+    `(lineno, text)` pairs in the `comments` attribute.
+
+    .. code-block:: python
+
+        >>> with UnicodeReader('tests/fixtures/frictionless-data.csv', delimiter='|') as reader:
+        ...     for row in reader:
+        ...         print(row)
+        ...         break
+        ...
+        ['FK', 'Year', 'Location name', 'Value', 'binary', 'anyURI', 'email', 'boolean', 'array',
+        'geojson']
+    """
+    def __init__(
+            self,
+            f: LINES_OR_PATH,
+            dialect: typing.Optional[typing.Union[Dialect, str]] = None,
+            **kw):
+        self.f = f
+        self.encoding = normalize_encoding(kw.pop('encoding', 'utf-8-sig'))
+        self.newline = kw.pop('lineterminator', None)
+        self.dialect = dialect if isinstance(dialect, Dialect) else None
+        if self.dialect:
+            # Normalize the dialect's encoding as well (as UnicodeWriter does) - otherwise an
+            # alias like 'UTF-8' would not be recognized by the BOM handling below.
+            self.encoding = normalize_encoding(self.dialect.python_encoding)
+            self.kw = dialect.as_python_formatting_parameters()
+            self.kw.update(kw)
+        else:
+            self.kw = kw
+            if dialect:
+                self.kw['dialect'] = dialect
+        self._close = False
+        self.comments = []
+
+        # We potentially screw people with valid CSV files where the content - presumably the
+        # header - starts with 0xfeff. But the chance of irritating people trying to read Excel
+        # exported CSV with the defaults seems way bigger - and anyone with CSV column names
+        # starting with 0xfeff will run into more trouble down the line anyway ...
+        if self.encoding == 'utf-8':
+            self.encoding = 'utf-8-sig'
+
+        # Encoding used by `_next_row` to decode any cell that is not already a `str`.
+        self._reader_encoding = self.encoding
+
+    def __enter__(self):
+        if isinstance(self.f, (str, pathlib.Path)):
+            if isinstance(self.f, pathlib.Path):
+                self.f = str(self.f)
+
+            self.f = io.open(self.f, mode='rt', encoding=self.encoding, newline=self.newline or '')
+            self._close = True
+        elif not hasattr(self.f, 'read'):
+            # An iterable of lines: make sure csv.reader only ever sees text.
+            self.f = [
+                line.decode(self.encoding) if isinstance(line, bytes) else line
+                for line in self.f]
+        self.reader = csv.reader(self.f, **self.kw)
+        # 0-based number of the last text line consumed; advanced by `_next_row`.
+        self.lineno = -1
+        return self
+
+    def _next_row(self):
+        """Fetch the next raw row from `csv.reader`, keeping `lineno` in sync."""
+        self.lineno += 1
+        row = [
+            s if isinstance(s, str) else s.decode(self._reader_encoding)
+            for s in next(self.reader)]
+        # Cells may span several text lines; account for the embedded line breaks.
+        self.lineno += sum(s.count('\n') for s in row)
+        return row
+
+    def __next__(self):
+        row = self._next_row()
+        dialect = self.dialect
+        if not dialect:
+            return row
+
+        while True:
+            is_comment = bool(
+                row and dialect.commentPrefix and row[0].startswith(dialect.commentPrefix))
+            is_blank = (not row or set(row) == {''}) and dialect.skipBlankRows
+            is_skipped = self.lineno < dialect.skipRows
+            if not (is_comment or is_blank or is_skipped):
+                break
+            if is_comment or (row and is_skipped):
+                # Keep the content of comments and skipped leading rows around.
+                self.comments.append((
+                    self.lineno,
+                    dialect.delimiter.join(row).lstrip(dialect.commentPrefix).strip(),
+                ))
+            row = self._next_row()
+        return [dialect.trimmer(s) for s in row][dialect.skipColumns:]
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Only close file handles opened by the reader itself.
+        if self._close:
+            self.f.close()
+
+    def __iter__(self):
+        return self
+
+
+class UnicodeReaderWithLineNumber(UnicodeReader):
+    """
+    A `UnicodeReader` yielding (lineno, row) pairs, where "lineno" is a 1-based number of a
+    **text line** in the DSV file - as opposed to a count of rows.
+
+    NOTE(review): the number is derived from the reader's `lineno` counter *after* the row
+    has been read, i.e. after line breaks embedded in the row's cells have been counted. For
+    multi-line rows this looks like the line where the row ends - confirm the intended
+    semantics.
+    """
+    def __next__(self):
+        """
+        :return: a pair (1-based line number in the input, row)
+        """
+        # Reading the row advances the (0-based) line counter of the reader.
+        row = super().__next__()
+        lineno = self.lineno + 1
+        return lineno, row
+
+
+class UnicodeDictReader(UnicodeReader):
+    """
+    A `UnicodeReader` yielding one `dict` per row.
+
+    :param f: As for :class:`UnicodeReader`
+    :param fieldnames: Keys for the row dicts; if `None`, the first row read from the data \
+    is used.
+    :param restkey: Key under which surplus cells of rows longer than `fieldnames` are stored.
+    :param restval: Value used for the missing cells of rows shorter than `fieldnames`.
+    :param kw: Keyword arguments passed through to :class:`UnicodeReader`.
+
+    .. code-block:: python
+
+        >>> with UnicodeDictReader(
+        ...         'tests/fixtures/frictionless-data.csv',
+        ...         dialect=Dialect(delimiter='|', header=False),
+        ...         fieldnames=[str(i) for i in range(1, 11)]) as reader:
+        ...     for row in reader:
+        ...         print(row)
+        ...         break
+        ...
+        OrderedDict([('1', 'FK'), ('2', 'Year'), ('3', 'Location name'), ('4', 'Value'),
+        ('5', 'binary'), ('6', 'anyURI'), ('7', 'email'), ('8', 'boolean'), ('9', 'array'),
+        ('10', 'geojson')])
+
+    """
+
+    def __init__(self, f, fieldnames=None, restkey=None, restval=None, **kw):
+        self._fieldnames = fieldnames   # list of keys for the dict
+        self.restkey = restkey          # key to catch long rows
+        self.restval = restval          # default value for short rows
+        self.line_num = 0
+        super().__init__(f, **kw)
+
+    @property
+    def fieldnames(self):
+        """
+        The keys used for the row dicts.
+
+        If no fieldnames were passed in, the next row of the data is consumed as header on
+        first access; the value stays `None` if the data is exhausted. A warning is issued
+        when the names are not unique.
+        """
+        if self._fieldnames is None:
+            try:
+                self._fieldnames = super().__next__()
+            except StopIteration:
+                pass
+        self.line_num = self.reader.line_num
+        if self._fieldnames and len(set(self._fieldnames)) != len(self._fieldnames):
+            warnings.warn('Duplicate column names!')
+        return self._fieldnames
+
+    def __next__(self):
+        if self.line_num == 0:
+            # Used only for its side effect.
+            self.fieldnames
+        row = super().__next__()
+        self.line_num = self.reader.line_num
+
+        # unlike the basic reader, we prefer not to return blanks,
+        # because we will typically wind up with a dict full of None
+        # values
+        while row == []:
+            row = super().__next__()
+        return self.item(row)
+
+    def item(self, row):
+        """
+        Turn a row into an `OrderedDict` keyed by the fieldnames.
+
+        :param row: `list` of cell values.
+        :return: `OrderedDict`; surplus cells are stored as `list` under `restkey`, missing \
+        cells are filled with `restval`.
+        """
+        # Read the property once per row: each access re-checks the names for duplicates.
+        fieldnames = self.fieldnames
+        d = collections.OrderedDict(zip(fieldnames, row))
+        lf, lr = len(fieldnames), len(row)
+        if lf < lr:
+            d[self.restkey] = row[lf:]
+        elif lf > lr:
+            for key in fieldnames[lr:]:
+                d[key] = self.restval
+        return d
+
+
+class NamedTupleReader(UnicodeDictReader):
+    """
+    A `UnicodeReader` yielding one `namedtuple` per row.
+
+    .. note::
+
+        This reader has some limitations, notably that fieldnames must be normalized to be
+        admissible Python names, but also bad performance (compared with `UnicodeDictReader`).
+    """
+
+    _normalize_fieldname = staticmethod(utils.normalize_name)
+
+    @functools.cached_property
+    def cls(self):
+        """The `namedtuple` class `Row`, with one field per normalized fieldname."""
+        fieldnames = [self._normalize_fieldname(name) for name in self.fieldnames]
+        return collections.namedtuple('Row', fieldnames)
+
+    def item(self, row):
+        """
+        Turn a row into an instance of :attr:`cls`.
+
+        Only values keyed by a fieldname make it into the tuple, i.e. surplus cells stored
+        under `restkey` are dropped unless `restkey` is a fieldname itself.
+        """
+        d = UnicodeDictReader.item(self, row)
+        normalize = self._normalize_fieldname
+        # Read the property once rather than once per column: each access re-checks the
+        # names for duplicates.
+        fieldnames = self.fieldnames
+        return self.cls(**{normalize(name): d.get(name) for name in fieldnames})
+
+
+def iterrows(lines_or_file: LINES_OR_PATH,
+             namedtuples: typing.Optional[bool] = False,
+             dicts: typing.Optional[bool] = False,
+             encoding: typing.Optional[str] = 'utf-8',
+             **kw) -> typing.Generator:
+    """Convenience factory function for csv reader.
+
+    Depending on the flags, rows are read with `NamedTupleReader`, `UnicodeDictReader` or
+    - by default - `UnicodeReader`.
+
+    :param lines_or_file: Content to be read. Either a file handle, a file path or a list \
+    of strings.
+    :param namedtuples: Yield namedtuples.
+    :param dicts: Yield dicts.
+    :param encoding: Encoding of the content.
+    :param kw: Keyword parameters are passed through to csv.reader.
+    :return: A generator over the rows.
+    :raises ValueError: If both `namedtuples` and `dicts` are requested; being a generator, \
+    the function raises only once iteration starts.
+    """
+    if namedtuples and dicts:
+        raise ValueError('either namedtuples or dicts can be chosen as output format')
+
+    if namedtuples:
+        reader_cls = NamedTupleReader
+    else:
+        reader_cls = UnicodeDictReader if dicts else UnicodeReader
+
+    with reader_cls(lines_or_file, encoding=encoding, **kw) as rows:
+        for row in rows:
+            yield row
+
+
+reader = iterrows
+
+
+def rewrite(fname: typing.Union[str, pathlib.Path],
+            visitor: typing.Callable[[int, typing.List[str]], typing.Union[None, typing.List[str]]],
+            **kw):
+    """Utility function to rewrite rows in dsv files.
+
+    The rewritten content is collected in a temporary file, which replaces `fname` only
+    after all rows have been processed. If anything fails before that, `fname` is left
+    alone and the temporary file is removed.
+
+    :param fname: Path of the dsv file to operate on.
+    :param visitor: A callable that takes the 0-based index of a row (as yielded by the \
+    reader) and the row as input and returns a (modified) row or None to filter out the row.
+    :param kw: Keyword parameters are passed through to csv.reader/csv.writer.
+    :raises AssertionError: If `fname` is not an existing file.
+    """
+    fname = utils.ensure_path(fname)
+    assert fname.is_file()
+    with tempfile.NamedTemporaryFile(delete=False) as fp:
+        tmp = pathlib.Path(fp.name)
+
+    try:
+        with UnicodeReader(fname, **kw) as reader_, UnicodeWriter(tmp, **kw) as writer:
+            for i, row in enumerate(reader_):
+                row = visitor(i, row)
+                if row is not None:
+                    writer.writerow(row)
+        shutil.move(str(tmp), str(fname))
+    finally:
+        # After a successful move the temporary path is gone already; otherwise make sure
+        # no stray temporary file is left behind.
+        tmp.unlink(missing_ok=True)
+
+
+def add_rows(fname: typing.Union[str, pathlib.Path], *rows: typing.List[str]):
+    """Append rows to a dsv file, creating the file if it does not exist.
+
+    Existing content and the new rows are written to a temporary file which then replaces
+    `fname`. If anything fails before that, `fname` is left alone and the temporary file is
+    removed.
+
+    :param fname: Path of the dsv file to operate on.
+    :param rows: The rows to be added after the existing content.
+    """
+    # Validate the path first, so that no temporary file is created for unusable input.
+    fname = utils.ensure_path(fname)
+    with tempfile.NamedTemporaryFile(delete=False) as fp:
+        tmp = pathlib.Path(fp.name)
+
+    try:
+        with UnicodeWriter(tmp) as writer:
+            if fname.exists():
+                with UnicodeReader(fname) as reader_:
+                    writer.writerows(reader_)
+            writer.writerows(rows)
+        shutil.move(str(tmp), str(fname))
+    finally:
+        # After a successful move the temporary path is gone already; otherwise make sure
+        # no stray temporary file is left behind.
+        tmp.unlink(missing_ok=True)
+
+
+def filter_rows_as_dict(fname: typing.Union[str, pathlib.Path],
+                        filter_: typing.Callable[[dict], bool],
+                        **kw) -> int:
+    """Rewrite a dsv file, keeping only the rows accepted by a filter.
+
+    :param fname: Path to dsv file
+    :param filter_: callable which accepts a `dict` with a row's data as single argument \
+    and returns a `Boolean` indicating whether to keep the row (`True`) or to discard it \
+    (`False`).
+    :param kw: Keyword arguments to be passed `UnicodeReader` and `UnicodeWriter`.
+    :return: The number of rows that have been removed.
+    """
+    row_filter = DictFilter(filter_)
+    rewrite(fname, row_filter, **kw)
+    return row_filter.removed
+
+
+class DictFilter(object):
+    """
+    Callable taking a row index and a row, which applies a `dict`-based filter to the rows.
+
+    The row with index 0 is remembered as header and always kept. Any later non-empty row
+    is passed to the filter as `dict` keyed by the header; rejected rows are counted in
+    `removed`. Empty rows are dropped without being counted.
+    """
+
+    def __init__(self, filter_):
+        self.filter = filter_
+        self.header = None
+        self.removed = 0
+
+    def __call__(self, i, row):
+        """
+        :param i: 0-based index of the row.
+        :param row: `list` of cell values.
+        :return: The row if it is to be kept, `None` otherwise.
+        """
+        if i == 0:
+            self.header = row
+            return row
+        if not row:
+            return None
+        if self.filter(dict(zip(self.header, row))):
+            return row
+        self.removed += 1
+        return None
@@ -0,0 +1,160 @@
+"""
+DSV data can be surprisingly diverse. While Python's `csv` module offers out-of-the-box support
+for the basic formatting parameters, CSVW recognizes a couple more, like `skipColumns` or
+`skipRows`.
+
+.. seealso::
+
+    - `<https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/#dialect-descriptions>`_
+    - `<https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters>`_
+    - `<https://specs.frictionlessdata.io/csv-dialect/>`_
+"""
+import attr
+import warnings
+import functools
+
+from . import utils
+
+__all__ = ['Dialect']
+
+ENCODING_MAP = {
+    'UTF-8-BOM': 'utf-8-sig',  # Recognize the name of this encoding in R.
+}
+
+
+# FIXME: replace with attrs.validators.ge(0) from attrs 21.3.0
+def _non_negative(instance, attribute, value):
+    """
+    Validator (with the `(instance, attribute, value)` signature) rejecting negative values.
+
+    :raises ValueError: If `value` is less than 0.
+    """
+    is_negative = value < 0
+    if is_negative:  # pragma: no cover
+        raise ValueError('{0} is not a valid {1}'.format(value, attribute.name))
+
+
+non_negative_int = [attr.validators.instance_of(int), _non_negative]
+
+
+def convert_encoding(s):
+    """
+    Converter for the `encoding` property of a dialect description.
+
+    The value is first passed through `utils.converter` (with `'utf-8'` as default). If
+    Python cannot encode text with the resulting encoding name - after translation via
+    `ENCODING_MAP` - a warning is issued and `'utf-8'` is returned instead.
+
+    :return: The (untranslated) encoding name or `'utf-8'`.
+    """
+    s = utils.converter(str, 'utf-8', s)
+    codec_name = ENCODING_MAP.get(s, s)
+    try:
+        # Probe whether Python knows a text encoding of this name.
+        'x'.encode(codec_name)
+    except LookupError:
+        warnings.warn('Invalid value for property: {}'.format(s))
+        return 'utf-8'
+    return s
+
+
+@attr.s
+class Dialect(object):
+    """
+    A CSV dialect specification.
+
+    The camel-cased attributes mirror the properties of a CSVW dialect description; the
+    snake-cased properties and methods translate these to what Python's `csv` module needs.
+
+    .. seealso:: `<https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/#dialect-descriptions>`_
+    """
+
+    # Name of the character encoding; see `python_encoding` for the name to use in Python.
+    encoding = attr.ib(
+        default='utf-8',
+        converter=convert_encoding,
+        validator=attr.validators.instance_of(str))
+
+    # Either a list of line terminators or a single one given as string.
+    lineTerminators = attr.ib(
+        default=attr.Factory(lambda: ['\r\n', '\n']),
+        converter=functools.partial(utils.converter, list, ['\r\n', '\n']))
+
+    # `None` is an admissible value.
+    quoteChar = attr.ib(
+        default='"',
+        converter=functools.partial(utils.converter, str, '"', allow_none=True),
+    )
+
+    doubleQuote = attr.ib(
+        default=True,
+        converter=functools.partial(utils.converter, bool, True),
+        validator=attr.validators.instance_of(bool))
+
+    # Number of leading rows to skip.
+    skipRows = attr.ib(
+        default=0,
+        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
+        validator=non_negative_int)
+
+    # `None` is an admissible value.
+    commentPrefix = attr.ib(
+        default='#',
+        converter=functools.partial(utils.converter, str, '#', allow_none=True),
+        validator=attr.validators.optional(attr.validators.instance_of(str)))
+
+    header = attr.ib(
+        default=True,
+        converter=functools.partial(utils.converter, bool, True),
+        validator=attr.validators.instance_of(bool))
+
+    headerRowCount = attr.ib(
+        default=1,
+        converter=functools.partial(utils.converter, int, 1, cond=lambda s: s >= 0),
+        validator=non_negative_int)
+
+    delimiter = attr.ib(
+        default=',',
+        converter=functools.partial(utils.converter, str, ','),
+        validator=attr.validators.instance_of(str))
+
+    # Number of leading columns to drop from each row.
+    skipColumns = attr.ib(
+        default=0,
+        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
+        validator=non_negative_int)
+
+    skipBlankRows = attr.ib(
+        default=False,
+        converter=functools.partial(utils.converter, bool, False),
+        validator=attr.validators.instance_of(bool))
+
+    skipInitialSpace = attr.ib(
+        default=False,
+        converter=functools.partial(utils.converter, bool, False),
+        validator=attr.validators.instance_of(bool))
+
+    # One of 'true', 'false', 'start', 'end'; booleans are translated to 'true'/'false'.
+    trim = attr.ib(
+        default='false',
+        converter=lambda v: utils.converter(
+            (str, bool), 'false', '{0}'.format(v).lower() if isinstance(v, bool) else v),
+        validator=attr.validators.in_(['true', 'false', 'start', 'end']))
+
+    def updated(self, **kw):
+        """
+        Create a copy of the dialect with some attributes changed.
+
+        NOTE(review): the new values are assigned with `setattr` on the copy - presumably
+        converters and validators are not run for these; confirm.
+
+        :param kw: Attribute names mapped to their new values.
+        :return: New `Dialect` instance.
+        """
+        clone = self.__class__(**attr.asdict(self))
+        for name, value in kw.items():
+            setattr(clone, name, value)
+        return clone
+
+    @functools.cached_property
+    def escape_character(self):
+        """`None` without a quote character, otherwise `'"'` or - if not `doubleQuote` - `'\\\\'`."""
+        if self.quoteChar is None:
+            return None
+        return '"' if self.doubleQuote else '\\'
+
+    @functools.cached_property
+    def line_terminators(self):
+        """The line terminators as `list`, wrapping a single terminator given as `str`."""
+        terminators = self.lineTerminators
+        return [terminators] if isinstance(terminators, str) else terminators
+
+    @functools.cached_property
+    def trimmer(self):
+        """A callable which strips whitespace from a cell value as specified by `trim`."""
+        trimmers = {
+            'true': lambda s: s.strip(),
+            'false': lambda s: s,
+            'start': lambda s: s.lstrip(),
+            'end': lambda s: s.rstrip(),
+        }
+        return trimmers[self.trim]
+
+    def asdict(self, omit_defaults=True):
+        """Serialize the dialect via `utils.attr_asdict`."""
+        return utils.attr_asdict(self, omit_defaults=omit_defaults)
+
+    @property
+    def python_encoding(self):
+        """The encoding name, translated via `ENCODING_MAP` where Python's name differs."""
+        return ENCODING_MAP.get(self.encoding, self.encoding)
+
+    def as_python_formatting_parameters(self):
+        """
+        :return: `dict` of keyword arguments suitable for `csv.reader` and `csv.writer`.
+        """
+        # We have to hack around incompatible ways escape char is interpreted in csvw
+        # and python's csv lib:
+        escapechar = None if self.doubleQuote else self.escape_character
+        return {
+            'delimiter': self.delimiter,
+            'doublequote': self.doubleQuote,
+            'escapechar': escapechar,
+            'lineterminator': self.line_terminators[0],
+            'quotechar': self.quoteChar,
+            'skipinitialspace': self.skipInitialSpace,
+            'strict': True,
+        }
@@ -0,0 +1,224 @@
+"""
+Functionality to convert tabular data in Frictionless Data Packages to CSVW.
+
+We translate [table schemas](https://specs.frictionlessdata.io/table-schema/) defined
+for [data resources](https://specs.frictionlessdata.io/data-resource/) in a
+[data package](https://specs.frictionlessdata.io/data-package/) to a CSVW TableGroup.
+
+This functionality can be used together with the `frictionless describe` command to add
+CSVW metadata to "raw" CSV tables.
+"""
+import json
+import pathlib
+
+
+def convert_column_spec(spec):
+    """
+    Convert a frictionless field descriptor into a CSVW column description.
+
+    https://specs.frictionlessdata.io/table-schema/#field-descriptors
+
+    :param spec: `dict` parsed from JSON representing a field descriptor; must have a `name`.
+    :return: `dict` with the keys `name` and `datatype` and - depending on the input - \
+    `titles`, `dc:description` and `propertyUrl`.
+    :raises NotImplementedError: For numeric fields with `bareNumber` set to `True`.
+    """
+    # Frictionless types which need a different name in CSVW:
+    typemap = {
+        'year': 'gYear',
+        'yearmonth': 'gYearMonth',
+    }
+    # Frictionless types which can be used as CSVW datatype as is:
+    same_name_types = (
+        'string', 'number', 'integer', 'boolean', 'date', 'time', 'datetime', 'duration')
+    # Frictionless types which are represented as JSON, mapped to the media type:
+    json_types = {
+        'object': 'application/json',
+        'array': 'application/json',
+        'geojson': 'application/geo+json',
+    }
+
+    title = spec.get('title')
+    datatype = {'base': 'string'}
+    res = {'name': spec['name'], 'datatype': datatype}
+
+    if 'type' in spec:
+        type_ = spec['type']
+        if type_ == 'string' and spec.get('format') == 'binary':
+            datatype['base'] = 'binary'
+        elif type_ == 'string' and spec.get('format') == 'uri':
+            datatype['base'] = 'anyURI'
+        elif type_ in typemap:
+            datatype['base'] = typemap[type_]
+        elif type_ in same_name_types:
+            datatype['base'] = type_
+            if type_ == 'string':
+                if spec.get('format'):
+                    datatype['dc:format'] = spec['format']
+            elif type_ == 'boolean':
+                if spec.get('trueValues') and spec.get('falseValues'):
+                    # Only the first of the listed values can be carried over.
+                    datatype['format'] = '{}|{}'.format(
+                        spec['trueValues'][0], spec['falseValues'][0])
+            elif type_ in ('number', 'integer'):
+                if spec.get('bareNumber') is True:  # pragma: no cover
+                    raise NotImplementedError(
+                        'bareNumber is not supported in CSVW. It may be possible to translate to '
+                        'a number pattern, though. See '
+                        'https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/'
+                        '#formats-for-numeric-types')
+                separators = ('decimalChar', 'groupChar')
+                if any(prop in spec for prop in separators):
+                    datatype['format'] = {p: spec[p] for p in separators if spec.get(p)}
+        elif type_ in json_types:
+            datatype['base'] = 'json'
+            datatype['dc:format'] = json_types[type_]
+
+    if title:
+        res['titles'] = [title]
+    if 'description' in spec:
+        res['dc:description'] = [spec['description']]
+    if 'rdfType' in spec:
+        res['propertyUrl'] = spec['rdfType']
+
+    constraints = spec.get('constraints', {})
+    if 'required' in constraints:
+        datatype['required'] = constraints['required']
+    # A pattern is only used as format if no format has been derived from the type. It is
+    # added right after `required`, which fixes the position of the key in the datatype.
+    if ('pattern' in constraints) and ('format' not in datatype):
+        datatype['format'] = constraints['pattern']
+    for prop in ('minLength', 'maxLength', 'minimum', 'maximum'):
+        if prop in constraints:
+            datatype[prop] = constraints[prop]
+    # FIXME: we could transform the "enum" constraint for string into
+    # a regular expression in the "format" property.
+    return res
+
+
+def convert_foreignKey(rsc_name, fk, resource_map):
+    """
+    Convert a frictionless foreign key into a CSVW foreign key definition.
+
+    "fields" is renamed to "columnReference" and the name of the referenced resource is
+    mapped to its url. An empty resource name marks a self-referential foreign key and is
+    resolved using `rsc_name`.
+
+    https://specs.frictionlessdata.io/table-schema/#foreign-keys
+
+    :param rsc_name: Name of the resource the foreign key is defined for.
+    :param fk: `dict` parsed from JSON representing the foreign key.
+    :param resource_map: `dict` mapping resource names to the value used as `resource`.
+    :return: `dict` with keys `columnReference` and `reference`.
+    """
+    columns = fk['fields']
+    target = fk['reference']
+    target_columns = target['fields']
+    target_resource = resource_map[target['resource'] or rsc_name]
+    return {
+        'columnReference': columns,
+        'reference': {
+            'columnReference': target_columns,
+            'resource': target_resource,
+        },
+    }
+
+
+def convert_table_schema(rsc_name, schema, resource_map):
+    """
+    Convert a frictionless Table Schema into a CSVW schema description.
+
+    :param rsc_name: `name` property of the resource the schema belongs to. Needed to resolve \
+    self-referential foreign keys.
+    :param schema: `dict` parsed from JSON representing a frictionless Table Schema object.
+    :param resource_map: `dict` mapping resource names to resource paths, needed to convert \
+    foreign key constraints.
+    :return: `dict` suitable for instantiating a `csvw.metadata.Schema` object.
+    """
+    # Frictionless property names paired with the corresponding CSVW names:
+    renamed = (
+        ('missingValues', 'null'),
+        ('primaryKey', 'primaryKey'),
+        ('foreignKeys', 'foreignKeys'),
+    )
+    res = {'columns': [convert_column_spec(field) for field in schema['fields']]}
+    for prop, toprop in renamed:
+        if prop not in schema:
+            continue
+        value = schema[prop]
+        if prop == 'foreignKeys':
+            value = [convert_foreignKey(rsc_name, fk, resource_map) for fk in value]
+        res[toprop] = value
+    return res
+
+
+def convert_dialect(rsc):
+    """
+    Derive a CSVW dialect description from a frictionless data resource.
+
+    Limitations: lineTerminator is not supported.
+
+    https://specs.frictionlessdata.io/csv-dialect/
+
+    :param rsc: `dict` parsed from JSON representing the data resource.
+    :return: `dict` of CSVW dialect properties.
+    """
+    spec = rsc.get('dialect', {})
+    # Work around https://github.com/frictionlessdata/frictionless-py/issues/1506
+    spec = spec['csv'] if 'csv' in spec else spec
+
+    res = {}
+    delimiter = spec.get('delimiter')
+    if delimiter:
+        res['delimiter'] = delimiter
+    encoding = rsc.get('encoding')
+    if encoding:
+        res['encoding'] = encoding
+    # Properties with the same name in both specifications:
+    shared = ('delimiter', 'quoteChar', 'doubleQuote', 'skipInitialSpace', 'header')
+    res.update((prop, spec[prop]) for prop in shared if prop in spec)
+    if 'commentChar' in spec:
+        res['commentPrefix'] = spec['commentChar']
+    return res
+
+
+class DataPackage:
+    """
+    A frictionless Data Package, to be converted to a CSVW TableGroup.
+
+    :param spec: The package description: another `DataPackage` (whose data is taken over), \
+    a `dict` parsed from JSON, the `pathlib.Path` of a JSON file or a JSON formatted string.
+    :param directory: Directory the package lives in; defaults to the directory of the JSON \
+    file if `spec` is a path, to the current directory otherwise. Ignored if `spec` is a \
+    `DataPackage`.
+    """
+    def __init__(self, spec, directory=None):
+        if isinstance(spec, DataPackage):
+            self.json = spec.json
+            self.dir = spec.dir
+            return
+        if isinstance(spec, pathlib.Path):
+            # `directory` may be passed as `str`; `dir` must be a path in any case, since
+            # `to_tablegroup` joins a file name to it.
+            self.dir = pathlib.Path(directory or spec.parent)
+            spec = json.loads(spec.read_text(encoding='utf8'))
+        else:
+            self.dir = pathlib.Path(directory or '.')
+            if not isinstance(spec, dict):
+                # Not an already parsed JSON object - assume a JSON formatted string.
+                spec = json.loads(spec)
+
+        self.json = spec
+
+    def to_tablegroup(self, cls=None):
+        """
+        Convert the package metadata and its CSV resources to a CSVW TableGroup.
+
+        Only resources with a `path`, a `schema`, scheme "file" and format "csv" are turned
+        into tables. The complete package description is kept as JSON string in
+        `dc:replaces`.
+
+        :param cls: Class to instantiate via its `fromvalue` method; defaults to \
+        `csvw.TableGroup`.
+        :return: The `cls` instance, with `_fname` pointing to `csvw-metadata.json` in the \
+        package directory.
+        """
+        from csvw import TableGroup
+
+        md = {'@context': "http://www.w3.org/ns/csvw"}
+        # Package metadata:
+        md['dc:replaces'] = json.dumps(self.json)
+
+        # Package properties not mapped so far: version, image
+        for flprop, csvwprop in [
+            ('id', 'dc:identifier'),
+            ('licenses', 'dc:license'),
+            ('title', 'dc:title'),
+            ('homepage', 'dcat:accessURL'),
+            ('description', 'dc:description'),
+            ('sources', 'dc:source'),
+            ('contributors', 'dc:contributor'),
+            ('profile', 'dc:conformsTo'),
+            ('keywords', 'dc:subject'),
+            ('created', 'dc:created'),
+        ]:
+            if flprop in self.json:
+                md[csvwprop] = self.json[flprop]
+
+        # The package name serves as fallback for the identifier - or, if there is an
+        # identifier already, for the title.
+        if 'name' in self.json:
+            if 'id' not in self.json:
+                md['dc:identifier'] = self.json['name']
+            elif 'title' not in self.json:
+                md['dc:title'] = self.json['name']
+
+        # Data Resource metadata:
+        resources = [rsc for rsc in self.json.get('resources', []) if 'path' in rsc]
+        resource_map = {rsc['name']: rsc['path'] for rsc in resources if 'name' in rsc}
+        for rsc in resources:
+            schema = rsc.get('schema')
+            if schema and \
+                    rsc.get('scheme') == 'file' and \
+                    rsc.get('format') == 'csv':
+                # Table Schema:
+                md.setdefault('tables', [])
+                table = dict(
+                    url=rsc['path'],
+                    tableSchema=convert_table_schema(rsc.get('name'), schema, resource_map),
+                    dialect=convert_dialect(rsc),
+                )
+                md['tables'].append(table)
+
+        cls = cls or TableGroup
+        res = cls.fromvalue(md)
+        res._fname = self.dir / 'csvw-metadata.json'
+        return res
@@ -0,0 +1,190 @@
+import re
+import json
+import math
+import typing
+import decimal
+import pathlib
+import datetime
+import collections
+
+import attr
+from rdflib import Graph, URIRef, Literal
+from rfc3986 import URIReference
+from isodate.duration import Duration
+
+from .utils import is_url
+
+__all__ = ['group_triples', 'to_json', 'Triple', 'format_value']
+
+
+def format_value(value, col):
+    """
+    Format values as JSON-LD literals.
+
+    Temporal values are rendered in ISO 8601 notation, with trailing zeros of fractional
+    seconds removed and a UTC offset of `+00:00` abbreviated to `Z`.
+
+    :param value: The Python object to format.
+    :param col: Column description; its `datatype` is consulted for temporal values (if `col` \
+    is truthy) and is required to format `timedelta`, `Duration` and `bytes` values.
+    :return: A string for temporal, duration, URI, binary and path values as well as for \
+    non-finite floats; finite numbers and all other values are returned as Python objects.
+    """
+    if isinstance(value, (datetime.date, datetime.datetime, datetime.time)):
+        res = value.isoformat()
+        if col and col.datatype.base == 'time':
+            res = res.split('T')[-1]
+        if col and col.datatype.base == 'date':
+            res = re.sub('T[0-9.:]+', '', res)
+        if isinstance(value, (datetime.datetime, datetime.time)):
+            stamp, _, fraction = res.partition('.')
+            if not fraction:
+                return stamp.replace('+00:00', 'Z')
+            # What follows the dot is the run of fractional digits, optionally followed by a
+            # UTC offset. Only the digits may lose trailing zeros - stripping the whole
+            # remainder would mangle offsets such as "+00:00" or "+05:30".
+            digits = re.match('[0-9]*', fraction).group()
+            offset = fraction[len(digits):]
+            return '{}.{}{}'.format(
+                stamp, digits.rstrip('0'), offset.replace('+00:00', 'Z'))
+        return res  # pragma: no cover
+    if isinstance(value, datetime.timedelta):
+        return col.datatype.formatted(value)
+    if isinstance(value, Duration):
+        return col.datatype.formatted(value)
+    if isinstance(value, decimal.Decimal):
+        # Decimals are passed on as floats, thus are subject to the float handling below.
+        value = float(value)
+    if isinstance(value, URIReference):
+        return value.unsplit()
+    if isinstance(value, bytes):
+        return col.datatype.formatted(value)
+    if isinstance(value, pathlib.Path):
+        return str(value)
+    if isinstance(value, float):
+        # Non-finite numbers are spelled out as strings.
+        return 'NaN' if math.isnan(value) else (
+            '{}INF'.format('-' if value < 0 else '') if math.isinf(value) else value)
+    return value
+
+
+@attr.s
+class Triple:
+    """
+    The data of one table cell, expressed as subject - predicate - object statement.
+    """
+    about = attr.ib()
+    property = attr.ib()
+    value = attr.ib()
+
+    def as_rdflib_triple(self):
+        """
+        Convert to a 3-tuple of `rdflib` terms. The value becomes a `URIRef` if it looks like
+        an HTTP(S) URL, a `Literal` otherwise.
+        """
+        subject = URIRef(self.about)
+        predicate = URIRef(self.property)
+        obj = URIRef(self.value) if is_url(self.value) else Literal(self.value)
+        return subject, predicate, obj
+
+    @classmethod
+    def from_col(cls, table, col, row, prop, val, rownum):
+        """
+        Create the triple for a cell value.
+
+        `propertyUrl` and `valueUrl` are taken from the column or - if no column is given -
+        inherited from the table; `aboutUrl` is only taken from the column. If present, these
+        templates are expanded with the row's data via `table.expand`.
+
+        :param table: Object providing `expand` and `inherit`.
+        :param col: Column description or a falsy value.
+        :param row: The row data passed into the template expansion.
+        :param prop: Property name, used unless a `propertyUrl` applies.
+        :param val: Cell value, used unless a `valueUrl` applies.
+        :param rownum: Row number, passed to the template expansion as `_row`.
+        """
+        colname = col.header if col else None
+        expansion = dict(_row=rownum, _name=colname)
+
+        property_template = col.propertyUrl if col else table.inherit('propertyUrl')
+        if property_template:
+            prop = table.expand(property_template, row, qname=True, **expansion)
+
+        # Objects of rdf:type statements are expanded to qnames rather than to URIs.
+        typed = prop == 'rdf:type'
+        value_template = col.valueUrl if col else table.inherit('valueUrl')
+        if value_template:
+            val = table.expand(value_template, row, qname=typed, uri=not typed, **expansion)
+        val = format_value(val, col)
+
+        subject = None
+        about_template = col.aboutUrl if col else None
+        if about_template:
+            subject = table.expand(about_template, row, **expansion) or None
+        return cls(about=subject, property=prop, value=val)
+
+
+def frame(data: list) -> list:
+    """
+    Embed objects which are referenced exactly once into the referencing object, to get a
+    deterministic layout of the graph.
+
+    Only objects with an `@id` are part of the result; an object which was embedded is
+    removed from the top level. The reference dicts in `data` are updated in place.
+
+    .. see:: https://w3c.github.io/json-ld-framing/#introduction
+    """
+    by_id = collections.OrderedDict()
+    # Maps a referenced id to the first reference dict seen and all referencing objects.
+    references = {}
+    for node in data:
+        node_id = node.get('@id')
+        if node_id:
+            by_id[node_id] = node
+        for prop_value in node.values():
+            candidates = prop_value if isinstance(prop_value, list) else [prop_value]
+            for candidate in candidates:
+                if not isinstance(candidate, dict):
+                    continue
+                target = candidate.get('@id')
+                if not target:
+                    continue
+                if target not in references:
+                    references[target] = (candidate, [])
+                references[target][1].append(node)
+
+    for stub, referrers in references.values():
+        if len(referrers) == 1 and stub['@id'] in by_id:
+            stub.update(by_id.pop(stub['@id']))
+    return list(by_id.values())
+
+
+def to_json(obj, flatten_list=False):
+    """
+    Simplify JSON-LD data by refactoring trivial objects.
+
+    - A value object (a `dict` with `@value`) is replaced by its value.
+    - A `dict` with `@id` as only key is replaced by the id.
+    - The key `rdf:type` is renamed to `@type`.
+
+    :param obj: JSON-LD data, i.e. (nested) `dict`, `list` or scalar objects.
+    :param flatten_list: If `True`, a `list` with exactly one item is replaced by this item.
+    :return: The simplified data.
+    """
+    if isinstance(obj, dict) and '@value' in obj:
+        obj = obj['@value']
+    # The unwrapped value may be a number, boolean or `None`, which have no length. So the
+    # type must be checked again, before testing for a reference object.
+    if isinstance(obj, dict) and len(obj) == 1 and '@id' in obj:
+        obj = obj['@id']
+    if isinstance(obj, dict):
+        return {
+            '@type' if k == 'rdf:type' else k: to_json(v, flatten_list=flatten_list)
+            for k, v in obj.items()}
+    if isinstance(obj, list):
+        if len(obj) == 1 and flatten_list:
+            return to_json(obj[0], flatten_list=flatten_list)
+        return [to_json(v, flatten_list=flatten_list) for v in obj]
+    return obj
+
+
+def group_triples(triples: typing.Iterable[Triple]) -> typing.List[dict]:
+    """
+    Group and frame triples into a `list` of JSON objects.
+
+    Triples without subject (`about`) are collected as properties of one top-level object.
+    If there are triples with a subject, all statements are loaded into an `rdflib` graph,
+    serialized as JSON-LD and then framed and simplified.
+
+    :param triples: The triples to group. Note that the `value` lists of list-valued triples \
+    may be extended in place.
+    :return: `list` of `dict` objects.
+    """
+    # Merge list-valued triples with the same property into the first such triple. (The
+    # subject of the triples is not compared.)
+    merged = []
+    for triple in triples:
+        if isinstance(triple.value, list):
+            for t in merged:
+                if t.property == triple.property and isinstance(t.value, list):
+                    t.value.extend(triple.value)
+                    break
+            else:
+                # No list-valued triple with this property has been seen yet.
+                merged.append(triple)
+        else:
+            merged.append(triple)
+
+    grouped = collections.OrderedDict()
+    # From here on, `triples` only holds the triples with a (truthy) subject.
+    triples = []
+    # First pass: get top-level properties.
+    for triple in merged:
+        if triple.about is None and triple.property == '@id':
+            grouped[triple.property] = triple.value
+        else:
+            if not triple.about:
+                # For test48
+                # Values for a property which appears repeatedly are aggregated in a list.
+                if triple.property in grouped:
+                    if not isinstance(grouped[triple.property], list):
+                        grouped[triple.property] = [grouped[triple.property]]
+                    grouped[triple.property].append(triple.value)
+                else:
+                    grouped[triple.property] = triple.value
+            else:
+                triples.append(triple)
+    if not triples:
+        # Only top-level properties, so there's no need to go through the graph.
+        return [grouped]
+
+    g = Graph()
+    for triple in triples:
+        g.add(triple.as_rdflib_triple())
+    if '@id' in grouped:
+        # The top-level properties are added as statements about the top-level `@id`.
+        for prop, val in grouped.items():
+            if prop != '@id':
+                g.add(Triple(about=grouped['@id'], property=prop, value=val).as_rdflib_triple())
+    res = g.serialize(format='json-ld')
+    # Frame and simplify the resulting objects, augment with list index:
+    res = [(i, to_json(v, flatten_list=True)) for i, v in enumerate(frame(json.loads(res)))]
+    # Sort the objects making sure the one with the row's aboutUrl as @id comes first:
+    res = [k[1] for k in sorted(
+        res, key=lambda o: -1 if o[1].get('@id') == grouped.get('@id') else o[0])]
+    # If there's no aboutUrl for the row and we have only one object from triples, we just merge
+    # the properties into a single object.
+    if grouped and ('@id' not in grouped) and len(res) == 1:
+        grouped.update(res[0])
+        return [grouped]
+
+    return res
@@ -0,0 +1,230 @@
+import re
+import copy
+import html
+import json
+import string
+import keyword
+import pathlib
+import warnings
+import collections
+import unicodedata
+
+import attr
+
+
+def is_url(s):
+    """
+    Check whether the string representation of `s` starts with an HTTP(S) scheme.
+
+    :return: The `re.Match` object (which is truthy) or `None`.
+    """
+    text = str(s)
+    return re.match(r'https?://', text)
+
+
+def converter(type_, default, s, allow_none=False, cond=None, allow_list=True):
+    """
+    Validate a property value, replacing invalid values with a default.
+
+    :param type_: The type (or tuple of types) the value must be an instance of.
+    :param default: The object to return for an invalid value.
+    :param s: The value to check.
+    :param allow_none: If `True`, `None` is accepted as valid value.
+    :param cond: Optional callable, which must return a truthy result for a valid value.
+    :param allow_list: If `True` (and `type_` is not `list`), a `list` is checked item by \
+    item and the `list` of valid items is returned.
+    :return: `s` if valid, otherwise `default` - after issuing a warning.
+    """
+    if allow_list and type_ != list and isinstance(s, list):
+        checked = (converter(type_, None, item, cond=cond) for item in s)
+        return [item for item in checked if item is not None]
+
+    if s is None and allow_none:
+        return s
+
+    # `bool` is a subclass of `int`, but booleans are not accepted as integers.
+    valid = isinstance(s, type_) and not (type_ == int and isinstance(s, bool))
+    if valid and cond:
+        valid = bool(cond(s))
+    if valid:
+        return s
+
+    warnings.warn('Invalid value for property: {}'.format(s))
+    return default
+
+
+def ensure_path(fname):
+    """
+    Return `fname` as `pathlib.Path`.
+
+    :param fname: `pathlib.Path` (which is returned as is) or `str`.
+    """
+    if isinstance(fname, pathlib.Path):
+        return fname
+    assert isinstance(fname, str)
+    return pathlib.Path(fname)
+
+
+def attr_defaults(cls):
+    """
+    Collect the default values of all fields of an `attrs` class.
+
+    :return: `OrderedDict` mapping field names to defaults; for defaults specified as \
+    `attr.Factory`, the result of calling the factory is used.
+    """
+    defaults = collections.OrderedDict()
+    for fld in attr.fields(cls):
+        value = fld.default
+        defaults[fld.name] = value.factory() if isinstance(value, attr.Factory) else value
+    return defaults
+
+
+def attr_asdict(obj, omit_defaults=True, omit_private=True):
+    """
+    Convert an instance of an `attrs` class to an `OrderedDict`.
+
+    :param omit_defaults: If `True`, fields with a value equal to the default are skipped.
+    :param omit_private: If `True`, fields with a name starting with `_` are skipped.
+    :return: `OrderedDict` mapping field names to values. Values which provide an `asdict` \
+    method are converted by calling it with `omit_defaults=True`.
+    """
+    klass = obj.__class__
+    defaults = attr_defaults(klass)
+    out = collections.OrderedDict()
+    for fld in attr.fields(klass):
+        if omit_private and fld.name.startswith('_'):
+            continue
+        current = getattr(obj, fld.name)
+        if omit_defaults and current == defaults[fld.name]:
+            continue
+        if hasattr(current, 'asdict'):
+            current = current.asdict(omit_defaults=True)
+        out[fld.name] = current
+    return out
+
+
+def normalize_name(s):
+    """Convert a string into a valid python attribute name.
+    This function is called to convert ASCII strings to something that can pass as
+    python attribute name, to be used with namedtuples.
+
+    Python keywords get an underscore appended - also if the keyword only emerges when
+    punctuation or diacritics are removed.
+
+    >>> str(normalize_name('class'))
+    'class_'
+    >>> str(normalize_name('class!'))
+    'class_'
+    >>> str(normalize_name('a-name'))
+    'a_name'
+    >>> str(normalize_name('a n\u00e4me'))
+    'a_name'
+    >>> str(normalize_name('Name'))
+    'Name'
+    >>> str(normalize_name(''))
+    '_'
+    >>> str(normalize_name('1'))
+    '_1'
+    """
+    s = s.replace('-', '_').replace('.', '_').replace(' ', '_')
+    if keyword.iskeyword(s):
+        return s + '_'
+    # Slugify the parts separately, to keep the underscores.
+    s = '_'.join(slug(ss, lowercase=False) for ss in s.split('_'))
+    if not s:
+        s = '_'
+    if s[0] not in string.ascii_letters + '_':
+        s = '_' + s
+    # Removing punctuation or diacritics may have turned the name into a keyword.
+    if keyword.iskeyword(s):
+        s += '_'
+    return s
+
+
+def slug(s, remove_whitespace=True, lowercase=True):
+    """Condensed version of s, containing only ASCII alphanumeric characters.
+
+    Diacritics, punctuation and non-ASCII characters are removed.
+
+    :param remove_whitespace: If `True`, whitespace is removed, otherwise each run of \
+    whitespace is replaced with a single space.
+    :param lowercase: If `True`, the result is converted to lowercase.
+
+    >>> str(slug('A B. \u00e4C'))
+    'abac'
+    """
+    decomposed = unicodedata.normalize('NFD', s)
+    # Drop the combining marks which were split off by the decomposition.
+    res = ''.join(filter(lambda c: unicodedata.category(c) != 'Mn', decomposed))
+    if lowercase:
+        res = res.lower()
+    res = res.translate(str.maketrans('', '', string.punctuation))
+    whitespace_replacement = '' if remove_whitespace else ' '
+    res = re.sub(r'\s+', whitespace_replacement, res)
+    res = res.encode('ascii', 'ignore').decode('ascii')
+    assert re.match('[ A-Za-z0-9]*$', res)
+    return res
+
+
+def qname2url(qname):
+    """
+    Expand a qualified name with a known namespace prefix to a URL.
+
+    Only the leading prefix is expanded, the local part of the name is kept as is.
+
+    :param qname: `str` of the form "prefix:name".
+    :return: The URL as `str`, or `None` if `qname` does not start with a known prefix.
+    """
+    namespaces = {
+        'csvw': 'http://www.w3.org/ns/csvw#',
+        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
+        'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
+        'xsd': 'http://www.w3.org/2001/XMLSchema#',
+        'dc': 'http://purl.org/dc/terms/',
+        'dcat': 'http://www.w3.org/ns/dcat#',
+        'prov': 'http://www.w3.org/ns/prov#',
+    }
+    prefix, colon, local = qname.partition(':')
+    if colon and prefix in namespaces:
+        return namespaces[prefix] + local
+    return None
+
+
+def metadata2markdown(tg, link_files=False) -> str:
+    """
+    Render the metadata of a dataset as markdown.
+
+    The document consists of a title, the common properties of the table group and a section
+    for each table, listing its common properties, CSV dialect and columns.
+
+    :param tg: The table group describing the dataset.
+    :param link_files: If True, links to data files will be added, assuming the markdown is stored \
+    in the same directory as the metadata file.
+    :return: `str` with markdown formatted text
+    """
+    def qname2link(qname, html=False):
+        # Link names with a known namespace prefix to the expanded URL.
+        url = qname2url(qname)
+        if url:
+            if html:
+                return '<a href="{}">{}</a>'.format(url, qname)
+            return '[{}]({})'.format(qname, url)
+        return qname
+
+    def htmlify(obj, key=None):
+        """
+        For inclusion in tables we must use HTML for lists.
+        """
+        if isinstance(obj, list):
+            return '<ol>{}</ol>'.format(
+                ''.join('<li>{}</li>'.format(htmlify(item, key=key)) for item in obj))
+        if isinstance(obj, dict):
+            items = []
+            for k, v in obj.items():
+                items.append('<dt>{}</dt><dd>{}</dd>'.format(
+                    qname2link(k, html=True), html.escape(str(v))))
+            return '<dl>{}</dl>'.format(''.join(items))
+        return str(obj)
+
+    def properties(props):
+        # Render common properties: Description and image as text, the rest as table.
+        props = {k: v for k, v in copy.deepcopy(props).items() if v}
+        res = []
+        desc = props.pop('dc:description', None)
+        if desc:
+            res.append(desc + '\n')
+        img = props.pop('https://schema.org/image', None)
+        if img:
+            if isinstance(img, str):  # pragma: no cover
+                # A plain string is taken as the image URL; it must be stored under the key
+                # which is read below.
+                img = {'https://schema.org/contentUrl': img}
+            res.append('![{}]({})\n'.format(
+                img.get('https://schema.org/caption') or '',
+                img.get('https://schema.org/contentUrl')))
+        if props:
+            res.append('property | value\n --- | ---')
+            for k, v in props.items():
+                res.append('{} | {}'.format(qname2link(k), htmlify(v, key=k)))
+        return '\n'.join(res) + '\n'
+
+    def colrow(col, fks, pk):
+        # Render one row of the columns table: name, datatype and description.
+        dt = '`{}`'.format(col.datatype.base if col.datatype else 'string')
+        if col.datatype:
+            if col.datatype.format:
+                # A format made up of words separated by "|" is presented as list of choices.
+                if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format):
+                    dt += '<br>Valid choices:<br>'
+                    dt += ''.join(' `{}`'.format(w) for w in col.datatype.format.split('|'))
+                elif col.datatype.base == 'string':
+                    dt += '<br>Regex: `{}`'.format(col.datatype.format)
+            # NOTE(review): a bound of 0 is falsy and thus not rendered - confirm that this
+            # is intended.
+            if col.datatype.minimum:
+                dt += '<br>&ge; {}'.format(col.datatype.minimum)
+            if col.datatype.maximum:
+                dt += '<br>&le; {}'.format(col.datatype.maximum)
+        if col.separator:
+            dt = 'list of {} (separated by `{}`)'.format(dt, col.separator)
+        desc = col.common_props.get('dc:description', '').replace('\n', ' ')
+
+        if pk and col.name in pk:
+            desc = (desc + '<br>') if desc else desc
+            desc += 'Primary key'
+
+        if col.name in fks:
+            desc = (desc + '<br>') if desc else desc
+            desc += 'References [{}::{}](#table-{})'.format(
+                fks[col.name][1], fks[col.name][0], slug(fks[col.name][1]))
+
+        return ' | '.join([
+            '[{}]({})'.format(col.name, col.propertyUrl)
+            if col.propertyUrl else '`{}`'.format(col.name),
+            dt,
+            desc,
+        ])
+
+    res = ['# {}\n'.format(tg.common_props.get('dc:title', 'Dataset'))]
+    if tg._fname and link_files:
+        res.append('> [!NOTE]\n> Described by [{0}]({0}).\n'.format(tg._fname.name))
+
+    res.append(properties({k: v for k, v in tg.common_props.items() if k != 'dc:title'}))
+
+    for table in tg.tables:
+        # Only single-column foreign keys are listed with the referencing column.
+        fks = {
+            fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string)
+            for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1}
+        header = '## <a name="table-{}"></a>Table '.format(slug(table.url.string))
+        if link_files and tg._fname and tg._fname.parent.joinpath(table.url.string).exists():
+            header += '[{0}]({0})\n'.format(table.url.string)
+        else:  # pragma: no cover
+            header += table.url.string
+        res.append('\n' + header + '\n')
+        res.append(properties(table.common_props))
+        dialect = table.inherit('dialect')
+        if dialect.asdict():
+            res.append('\n**CSV dialect**: `{}`\n'.format(json.dumps(dialect.asdict())))
+        res.append('\n### Columns\n')
+        res.append('Name/Property | Datatype | Description')
+        res.append(' --- | --- | --- ')
+        for col in table.tableSchema.columns:
+            res.append(colrow(col, fks, table.tableSchema.primaryKey))
+    return '\n'.join(res)