Source code for mira.dkg.construct_registry

"""Constants for the MIRA Metaregistry."""

import csv
import gzip
import itertools as itt
import json
from collections import ChainMap
from pathlib import Path
from typing import Optional, Set

import bioregistry
import click
from bioregistry import Manager
from tqdm import tqdm

from mira.dkg.construct import METAREGISTRY_PATH, upload_s3, UseCasePaths
from mira.dkg.models import Config

HERE = Path(__file__).parent.resolve()
EPI_CONF_PATH = HERE.joinpath("metaregistry", "epi.json")

COLLECTIONS = {
    "0000007",  # publishing
    "0000008",  # ASKEM custom list, see https://bioregistry.io/collection/0000008
}
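
# A quick way to inspect what one of these collections contains (a sketch using
# the public bioregistry API already imported above; not part of this module):
#
#     collection = bioregistry.get_collection("0000008")
#     print(collection.name)
#     print(collection.resources)  # list of member prefixes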

EPI_USE_CASE = UseCasePaths("epi")
NODES_PATH = EPI_USE_CASE.NODES_PATH
EDGES_PATH = EPI_USE_CASE.EDGES_PATH


def get_prefixes(
    *, nodes_path: Optional[Path] = None, edges_path: Optional[Path] = None
) -> Set[str]:
    """Get the prefixes to use for the slim."""
    # Prefixes belonging to the Bioregistry itself (e.g., bioregistry.collection)
    bioregistry_prefixes = {
        resource.prefix
        for resource in bioregistry.resources()
        if "bioregistry" in resource.prefix
    }
    # Prefixes appearing in the curated Bioregistry collections listed in COLLECTIONS
    collection_prefixes = {
        prefix
        for collection_id in COLLECTIONS
        for prefix in bioregistry.get_collection(collection_id).resources
    }
    return set(
        itt.chain(
            get_dkg_prefixes(nodes_path=nodes_path, edges_path=edges_path),
            bioregistry_prefixes,
            collection_prefixes,
        )
    )
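
# A minimal usage sketch (not part of the module): get_prefixes() can be called
# directly against explicitly supplied DKG dumps. The file names below are
# hypothetical placeholders, not files shipped with MIRA.
#
#     from pathlib import Path
#     from mira.dkg.construct_registry import get_prefixes
#
#     prefixes = get_prefixes(
#         nodes_path=Path("nodes.tsv.gz"),
#         edges_path=Path("edges.tsv.gz"),
#     )
#     print(sorted(prefixes))
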
def get_dkg_prefixes(
    nodes_path: Optional[Path] = None, edges_path: Optional[Path] = None
) -> Set[str]:
    """Get the prefixes used by CURIEs in the DKG nodes and edges files."""
    prefixes: Set[str] = set()
    # Collect prefixes from node CURIEs and their xrefs
    with gzip.open(nodes_path or NODES_PATH, "rt") as file:
        reader = csv.reader(file, delimiter="\t")
        _header = next(reader)
        it = tqdm(reader, unit="node", unit_scale=True)
        for (
            curie,
            _label,
            _name,
            _synonyms,
            _obsolete,
            _type,
            _description,
            xrefs,
            _alts,
            _version,
            _property_predicates,
            _property_values,
            _xref_types,
            _synonym_types,
            _sources,
        ) in it:
            if not curie or curie.startswith("_:geni"):
                continue
            prefix, identifier = curie.split(":", 1)
            prefixes.add(prefix)
            for xref in xrefs.split(";"):
                if xref:
                    prefixes.add(xref.split(":", 1)[0])
    # Scan the edges file; subjects that are raw IRIs (not normalized to CURIEs) are skipped
    with gzip.open(edges_path or EDGES_PATH, "rt") as file:
        reader = csv.reader(file, delimiter="\t")
        _header = next(reader)
        it = tqdm(reader, unit="edge", unit_scale=True)
        for s, o, _type, p, _source, _graph, _version in it:
            if s.startswith("http"):
                continue  # skip unnormalized
    return prefixes


@click.command()
@click.option("--config-path", type=Path, default=EPI_CONF_PATH)
@click.option("--output-path", type=Path, default=METAREGISTRY_PATH)
@click.option("--nodes-path", type=Path, default=NODES_PATH)
@click.option("--edges-path", type=Path, default=EDGES_PATH)
@click.option("--upload", is_flag=True)
def main(config_path, output_path, nodes_path, edges_path, upload: bool):
    """Construct the slim metaregistry and write it to the output path."""
    _construct_registry(
        config_path=config_path,
        output_path=output_path,
        nodes_path=nodes_path,
        edges_path=edges_path,
        upload=upload,
    )


def _construct_registry(
    *,
    config_path: Path,
    output_path: Path,
    nodes_path: Optional[Path] = None,
    edges_path: Optional[Path] = None,
    upload: bool = False,
):
    config = Config.parse_file(config_path)
    prefixes = get_prefixes(nodes_path=nodes_path, edges_path=edges_path)
    # Layer the config's explicit registry entries over the Bioregistry resources
    # restricted to the prefixes actually used in the DKG
    manager = Manager(
        registry=ChainMap(
            dict(config.registry),
            {
                resource.prefix: resource
                for resource in bioregistry.resources()
                if resource.prefix in prefixes
            },
        )
    )
    new_config = Config(
        web=config.web,
        registry=manager._rasterized_registry(),
        collections=config.collections,
    )
    output_path.write_text(
        json.dumps(new_config.dict(exclude_none=True, exclude_unset=True), indent=2)
    )
    if upload:
        upload_s3(output_path, use_case="epi")


if __name__ == "__main__":
    main()
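
# A minimal invocation sketch, assuming the module is run as a script; the
# paths shown are illustrative (the defaults are EPI_CONF_PATH and
# METAREGISTRY_PATH defined above):
#
#     python -m mira.dkg.construct_registry \
#         --config-path mira/dkg/metaregistry/epi.json \
#         --output-path metaregistry.json
#
# Passing --upload additionally pushes the resulting file to S3 via upload_s3(),
# which presumably requires appropriately configured AWS credentials.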