# Source code for mira.sources.biomodels

"""
The BioModels database lists several high quality
models at https://www.ebi.ac.uk/biomodels/covid-19.
"""
import io
import zipfile
from typing import Dict, List

import pystow
import requests
from tabulate import tabulate
from tqdm import tqdm

from mira.metamodel import TemplateModel
from mira.modeling.viz import GraphicalModel
from mira.sources.sbml import (
    template_model_from_sbml_file_obj,
    template_model_from_sbml_string,
)

# pystow-managed cache roots: downloaded/derived artifacts are stored under
# the user's pystow data directory in mira/biomodels/
MODULE = pystow.module("mira")
BIOMODELS = MODULE.module("biomodels")

#: BioModels REST search endpoint (returns JSON when Accept header is set)
SEARCH_URL = "https://www.ebi.ac.uk/biomodels/search"
#: BioModels bulk download endpoint; serves a zip archive per model
DOWNLOAD_URL = "https://www.ebi.ac.uk/biomodels/search/download"

#: Species to exclude per model, keyed by BioModels ID. Passed as
#: ``reporter_ids`` when parsing SBML (see ``main``) — presumably these are
#: derived/cumulative reporter species rather than true compartments.
SPECIES_BLACKLIST = {
    "BIOMD0000000991": ["detected_cumulative"],
    "BIOMD0000000957": ["Confirmed"],
    "BIOMD0000000960": ["Cumulative_Cases"],
    # "BIOMD0000000970": ["Total_Population"],
}

#: Additional model identifiers for epidemiology models that
#: do not appear in the BioModels curated list of COVID-19 models
NON_COVID_EPI_MODELS = {
    "BIOMD0000000715",  # SEIS epidemic model with the impact of media
    "BIOMD0000001045",  # hong kong flu
    "MODEL1805220001",  # Human/Mosquito SEIR/SEI Mode
    "MODEL1805230001",  # Model for HIV-Malaria co-infection
    "MODEL1808280006",  # SIRWS model with immune boosting and cross-immunity between two pathogens
    "MODEL1008060002",  # zombie infection toy model (lol)
    "BIOMD0000000922",
    "BIOMD0000000726",
    "BIOMD0000000249",
    "BIOMD0000000294",
    "BIOMD0000000716",
    "BIOMD0000000717",
    "MODEL1008060000",
    "MODEL2212310001",
    "MODEL1808280011",
    "BIOMD0000000950",
    "BIOMD0000000949",
}
#: Models to exclude from processing entirely
MODEL_BLACKLIST = {
    "MODEL2209020001",  # Trash BEL model from Fraunhofer
    "MODEL2003020001",  # only has OMEX data
}
#: Annotation of missing pubmeds to model ids
MODEL_TO_PUBMED = {
    "BIOMD0000000716": "30839942",
    "BIOMD0000000717": "30839942",
}


def query_biomodels(
    query: str = "submitter_keywords:COVID-19",
    limit: int = 30,
) -> List[Dict[str, str]]:
    """Query and paginate over results from the BioModels API.

    .. seealso:: https://www.ebi.ac.uk/biomodels/docs/

    Parameters
    ----------
    query :
        The query string to search for. Defaults to
        "submitter_keywords:COVID-19".
    limit :
        The maximum number of results to return. Defaults to 30.

    Returns
    -------
    :
        A list of model metadata dictionaries.
    """
    model_ids = set()
    res = requests.get(
        SEARCH_URL,
        headers={"Accept": "application/json"},
        params={
            "query": query,
            "domain": "biomodels",
            "numResults": limit,
        },
        timeout=30,  # don't hang forever on a stalled connection
    ).json()
    model_ids.update(model["id"] for model in res.pop("models"))
    # Curated epidemiology models outside the COVID-19 list, minus known-bad models
    model_ids.update(NON_COVID_EPI_MODELS)
    model_ids.difference_update(MODEL_BLACKLIST)
    # TODO extend with pagination at same time as making query configurable
    rv = []
    for model_id in model_ids:
        model = {"biomodels_id": model_id}
        model_metadata = requests.get(
            f"https://www.ebi.ac.uk/biomodels/{model_id}",
            headers={"Accept": "application/json"},
            timeout=30,
        ).json()
        publication_link = model_metadata.get("publication", {}).get("link")
        if publication_link:
            # Prefer the manual pubmed annotation, then parse the link shape
            if model_id in MODEL_TO_PUBMED:
                model["pubmed"] = MODEL_TO_PUBMED[model_id]
            elif "identifiers.org/pubmed/" in publication_link:
                model["pubmed"] = publication_link.split("/")[-1]
            elif publication_link.startswith("http://identifiers.org/doi/"):
                model["doi"] = publication_link[len("http://identifiers.org/doi/"):]
            elif publication_link.startswith("https://doi.org/"):
                model["doi"] = publication_link[len("https://doi.org/"):]
            else:
                tqdm.write(f"[{model_id}] unhandled publication link: {publication_link}")
        model_name = model_metadata.get("name")
        # Skip models whose display name is just the accession
        if model_name == model_id:
            continue
        # Split titles that have the AuthorYYYY - Title format
        try:
            model_author, model_name = (s.strip() for s in model_name.split(" - ", 1))
        except ValueError:
            # NOTE(review): models without the "Author - Title" pattern are
            # currently dropped from the result list (the name is set but
            # ``continue`` skips the append) — confirm this is intentional
            model["name"] = model_name
            continue
        else:
            model["name"] = model_name
            # Author field ends with a 4-digit year, e.g. "Smith2020"
            model["author"] = model_author[:-4]
            model["year"] = model_author[-4:]
        rv.append(model)
    return rv
def get_sbml_model(model_id: str) -> str:
    """Return the SBML string content for a BioModels model from the web.

    Parameters
    ----------
    model_id :
        The BioModels ID of the model.

    Returns
    -------
    :
        The SBML XML string corresponding to the model.

    Raises
    ------
    FileNotFoundError
        If the model does not exist on the source server (HTTP 404).
    requests.HTTPError
        If the server returns any other error status.
    """
    url = f'{DOWNLOAD_URL}?models={model_id}'
    res = requests.get(url, timeout=30)
    if res.status_code == 404:
        raise FileNotFoundError(f'No such file on source server: {model_id}')
    # Fail loudly on other HTTP errors instead of a cryptic BadZipFile below
    res.raise_for_status()
    # Use a context manager so the archive is closed deterministically
    with zipfile.ZipFile(io.BytesIO(res.content)) as z:
        with z.open(f'{model_id}.xml') as inner:
            return inner.read().decode('utf-8')
def get_template_model(model_id: str) -> TemplateModel:
    """Return the Template Model processed from a BioModels model from the web.

    Parameters
    ----------
    model_id :
        The BioModels ID of the model.

    Returns
    -------
    :
        The Template model corresponding to the BioModels model.
    """
    # Fetch the raw SBML XML and hand it straight to the SBML processor
    return template_model_from_sbml_string(get_sbml_model(model_id))
def main():
    """Iterate over COVID-19 models, parse them, and write cached artifacts.

    For each model this caches the zip/SBML download, writes the parsed
    template model as JSON, renders a graphical representation, accumulates
    RDF-style triples into one TSV, and prints a summary table.
    """
    import pandas as pd

    # Absolute import for consistency with the file's other ``mira.`` imports;
    # the bare ``modeling.triples`` path only resolves when run from inside
    # the package directory.
    from mira.modeling.triples import TriplesGenerator

    triples_path = BIOMODELS.join(name="triples.tsv")
    query_path = BIOMODELS.join(name="query.tsv")
    # Reuse a cached query result if one exists; otherwise hit the API
    if query_path.is_file():
        df = pd.read_csv(query_path, sep="\t")
    else:
        models = query_biomodels("submitter_keywords:COVID-19", limit=30)
        df = pd.DataFrame(models).sort_values(["year", "author", "name"]).reset_index()
        df = df[["biomodels_id", "name", "author", "year", "pubmed", "doi"]]
        df.to_csv(query_path, sep="\t", index=False)

    rows = []
    dataframes = []
    for model_id, model_name, model_author, model_year, pubmed, doi in tqdm(
        df.values, desc="Converting", unit="model"
    ):
        model_module = BIOMODELS.module("models", model_id)
        url = f"{DOWNLOAD_URL}?models={model_id}"
        # ensure_open_zip caches the download and opens the inner SBML file
        with model_module.ensure_open_zip(
            url=url, name=f"{model_id}.zip", inner_path=f"{model_id}.xml"
        ) as file:
            try:
                template_model = template_model_from_sbml_file_obj(
                    file, model_id=model_id, reporter_ids=SPECIES_BLACKLIST.get(model_id)
                )
            except Exception as e:
                # Best-effort batch run: log and move on to the next model
                tqdm.write(f"[{model_id}] failed to parse: {e}")
                continue
        model_module.join(name=f"{model_id}.json").write_text(
            template_model.json(indent=2)
        )

        # Write a petri-net type graphical representation of the model
        graphical_model = GraphicalModel.from_template_model(template_model)
        graphical_model.graph.graph_attr[
            "label"
        ] = f"{model_name}\n{model_id}\n{model_author}, {model_year}"
        graphical_model.write(model_module.join(name=f"{model_id}.png"))
        graphical_model.write(BIOMODELS.join("images", name=f"{model_id}.png"))

        # Accumulate triples for this model, tagged with its identifier
        triples_generator = TriplesGenerator(
            template_model, skip_prefixes=["biomodel.species"]
        )
        triples_df = triples_generator.to_dataframe()
        triples_df["model"] = model_id
        dataframes.append(triples_df)

        rows.append(
            (
                model_id,
                model_name,
                len(template_model.templates),
                ", ".join(sorted({t.type for t in template_model.templates})),
            )
        )

    cat_triples_df = pd.concat(dataframes)
    cat_triples_df.to_csv(triples_path, sep="\t", index=False)

    summary_columns = ["model_id", "name", "# templates", "template_types"]
    summary_df = pd.DataFrame(rows, columns=summary_columns).sort_values(
        "# templates", ascending=False
    )
    print(tabulate(summary_df, headers=summary_df.columns, showindex=False))


if __name__ == "__main__":
    main()