From b1bac5864d8a6b6193c8a648ad43222e3f3b3a12 Mon Sep 17 00:00:00 2001 From: Daniel Bolin Date: Fri, 9 Feb 2024 16:04:21 -0500 Subject: [PATCH] Improve crosswalking and algorithm metadata --- .../azimuth/context/annotation-levels.json | 9 -- containers/azimuth/context/download-data.sh | 4 +- .../azimuth/context/download_reference_data.R | 6 +- containers/azimuth/context/main.py | 65 ++++++------ containers/azimuth/context/organ-mapping.json | 14 --- .../azimuth/context/organ-metadata.json | 62 ++++++++++++ containers/celltypist/context/main.py | 41 ++++---- .../celltypist/context/organ-mapping.json | 9 -- .../celltypist/context/organ-metadata.json | 23 +++++ containers/crosswalking/context/main.py | 85 +++++++++++----- containers/crosswalking/options.yml | 12 ++- containers/popv/context/download-ontology.sh | 2 +- containers/popv/context/main.py | 44 ++++++--- containers/popv/context/organ-mapping.json | 26 ----- containers/popv/context/organ-metadata.json | 98 +++++++++++++++++++ src/algorithm/__init__.py | 5 +- src/algorithm/algorithm.py | 97 +++++++++++------- src/algorithm/arguments.py | 6 +- src/algorithm/organ.py | 80 --------------- steps/check_annotation_report.cwl | 2 +- 20 files changed, 409 insertions(+), 281 deletions(-) delete mode 100644 containers/azimuth/context/annotation-levels.json delete mode 100644 containers/azimuth/context/organ-mapping.json create mode 100644 containers/azimuth/context/organ-metadata.json delete mode 100644 containers/celltypist/context/organ-mapping.json create mode 100644 containers/celltypist/context/organ-metadata.json delete mode 100644 containers/popv/context/organ-mapping.json create mode 100644 containers/popv/context/organ-metadata.json delete mode 100644 src/algorithm/organ.py diff --git a/containers/azimuth/context/annotation-levels.json b/containers/azimuth/context/annotation-levels.json deleted file mode 100644 index 387d385..0000000 --- a/containers/azimuth/context/annotation-levels.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "kidneyref": "annotation.l3", - "lungref": "ann_finest_level", - "heartref": "celltype.l2", - "humancortexref": "subclass", - "pancreasref": "annotation.l1", - "tonsilref": "celltype.l2", - "pbmcref": "celltype.l2" -} diff --git a/containers/azimuth/context/download-data.sh b/containers/azimuth/context/download-data.sh index ff9c5ba..faa6acc 100644 --- a/containers/azimuth/context/download-data.sh +++ b/containers/azimuth/context/download-data.sh @@ -2,8 +2,8 @@ set -e OUTPUT_DIR=${1:-"./azimuth"} -MAPPING_FILE=${2:-"/organ-mapping.json"} +METADATA_FILE=${2:-"/organ-metadata.json"} export R_LIBS="$OUTPUT_DIR" mkdir -p "$OUTPUT_DIR" -Rscript /download_reference_data.R "$MAPPING_FILE" "$OUTPUT_DIR" +Rscript /download_reference_data.R "$METADATA_FILE" "$OUTPUT_DIR" diff --git a/containers/azimuth/context/download_reference_data.R b/containers/azimuth/context/download_reference_data.R index cbc6504..ee27eab 100644 --- a/containers/azimuth/context/download_reference_data.R +++ b/containers/azimuth/context/download_reference_data.R @@ -3,12 +3,12 @@ library(Seurat) library(SeuratData) args <- commandArgs(trailingOnly = TRUE) -organ_mapping_file <- args[1] +organ_metadata_file <- args[1] output_dir <- args[2] # Load unique reference organs -mapping <- fromJSON(file = organ_mapping_file) -references <- unlist(unique(mapping)) +metadata <- fromJSON(file = organ_metadata_file) +references <- unique(sapply(metadata, function(item) item$model)) # Download and install data options(timeout=60 * 60) # Probably overkill but the default of 60s is to low for some of the datasets diff --git a/containers/azimuth/context/main.py b/containers/azimuth/context/main.py index 1100a33..2fc5a65 100644 --- a/containers/azimuth/context/main.py +++ b/containers/azimuth/context/main.py @@ -1,4 +1,3 @@ -import json import logging import subprocess import typing as t @@ -7,24 +6,31 @@ import anndata import pandas -from src.algorithm import Algorithm, OrganLookup, add_common_arguments +from src.algorithm import Algorithm, RunResult, add_common_arguments + + +class AzimuthOrganMetadata(t.TypedDict): + model: str + organ_level: str + prediction_column: str class AzimuthOptions(t.TypedDict): reference_data_dir: Path - annotation_levels: Path - -class AzimuthAlgorithm(Algorithm[str, AzimuthOptions]): - def __init__(self): - super().__init__(OrganLookup) - def do_run(self, matrix: Path, organ: str, options: AzimuthOptions): +class AzimuthAlgorithm(Algorithm[AzimuthOrganMetadata, AzimuthOptions]): + def do_run( + self, + matrix: Path, + organ: str, + metadata: AzimuthOrganMetadata, + options: AzimuthOptions, + ) -> RunResult: """Annotate data using azimuth.""" data = anndata.read_h5ad(matrix) - reference_data = self.find_reference_data(organ, options["reference_data_dir"]) - annotation_level = self.find_annotation_level( - organ, options["annotation_levels"] + reference_data = self.find_reference_data( + organ, metadata["model"], options["reference_data_dir"] ) # Azimuth chokes when trying to load matrices that has @@ -41,7 +47,11 @@ def do_run(self, matrix: Path, organ: str, options: AzimuthOptions): annotated_matrix = anndata.read_h5ad(annotated_matrix_path) self.copy_annotations(data, annotated_matrix) - return data, annotation_level + return { + "data": data, + "organ_level": metadata["organ_level"], + "prediction_column": "predicted." + metadata["prediction_column"], + } def create_clean_matrix(self, matrix: anndata.AnnData) -> anndata.AnnData: """Creates a copy of the data with all observation columns removed. @@ -73,7 +83,7 @@ def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str: Args: matrix_path (Path): Path to data file - reference_data (Path): Path to organ reference data directory + reference_data (Path): Path to model reference data directory Returns: str: Path to the output data file @@ -82,11 +92,12 @@ def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str: subprocess.run(script_command, capture_output=True, check=True, text=True) return "./result.h5ad" - def find_reference_data(self, organ: str, dir: Path) -> Path: - """Finds the reference data directory for an organ. + def find_reference_data(self, organ: str, model: str, dir: Path) -> Path: + """Finds the reference data directory for a model. Args: - organ (str): Organ name + organ (str): Organ id + model (str): Model name dir (Path): Directory to search Raises: @@ -97,7 +108,7 @@ def find_reference_data(self, organ: str, dir: Path) -> Path: """ def is_reference_data_candidate(path: Path): - return path.is_dir() and organ.lower() in path.name.lower() + return path.is_dir() and model.lower() in path.name.lower() subdir = self._find_in_dir( dir, @@ -108,20 +119,6 @@ def is_reference_data_candidate(path: Path): # idx.annoy and ref.Rds is always located inside an 'azimuth' subdirectory return subdir / "azimuth" - def find_annotation_level(self, organ: str, path: Path) -> str: - """Finds the column name which contains the predictions. - - Args: - organ (str): Organ name - path (Path): Path to file containing information about column names - - Returns: - str: Column name - """ - with open(path) as file: - levels_by_organ = json.load(file) - return "predicted." + levels_by_organ[organ] - def _find_in_dir( self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str ) -> Path: @@ -159,12 +156,6 @@ def _get_arg_parser(): required=True, help="Path to directory with reference data", ) - parser.add_argument( - "--annotation-levels", - type=Path, - default="/annotation-levels.json", - help="Json file with organ to annotation levels", - ) return parser diff --git a/containers/azimuth/context/organ-mapping.json b/containers/azimuth/context/organ-mapping.json deleted file mode 100644 index daaa5ab..0000000 --- a/containers/azimuth/context/organ-mapping.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "UBERON:0002113": "kidneyref", - "UBERON:0004538": "kidneyref", - "UBERON:0004539": "kidneyref", - "UBERON:0002048": "lungref", - "UBERON:0001004": "lungref", - "UBERON:0000948": "heartref", - "UBERON:0000955": "humancortexref", - "UBERON:0001264": "pancreasref", - "UBERON:0002373": "tonsilref", - "UBERON:0000178": "pbmcref", - "UBERON:0002371": "bonemarrowref", - "UBERON:0001013": "adiposeref" -} diff --git a/containers/azimuth/context/organ-metadata.json b/containers/azimuth/context/organ-metadata.json new file mode 100644 index 0000000..833f834 --- /dev/null +++ b/containers/azimuth/context/organ-metadata.json @@ -0,0 +1,62 @@ +{ + "UBERON:0002113": { + "model": "kidneyref", + "organ_level": "Kidney_L3", + "prediction_column": "annotation.l3" + }, + "UBERON:0004538": { + "model": "kidneyref", + "organ_level": "Kidney_L3", + "prediction_column": "annotation.l3" + }, + "UBERON:0004539": { + "model": "kidneyref", + "organ_level": "Kidney_L3", + "prediction_column": "annotation.l3" + }, + "UBERON:0002048": { + "model": "lungref", + "organ_level": "Lung_v2_finest_level", + "prediction_column": "ann_finest_level" + }, + "UBERON:0001004": { + "model": "lungref", + "organ_level": "Lung_v2_finest_level", + "prediction_column": "ann_finest_level" + }, + "UBERON:0000948": { + "model": "heartref", + "organ_level": "Heart_L2", + "prediction_column": "celltype.l2" + }, + "UBERON:0000955": { + "model": "humancortexref", + "organ_level": "", + "prediction_column": "subclass" + }, + "UBERON:0001264": { + "model": "pancreasref", + "organ_level": "Pancreas_L1", + "prediction_column": "annotation.l1" + }, + "UBERON:0002373": { + "model": "tonsilref", + "organ_level": "", + "prediction_column": "celltype.l2" + }, + "UBERON:0000178": { + "model": "pbmcref", + "organ_level": "Human_PBMC_L2", + "prediction_column": "celltype.l2" + }, + "UBERON:0002371": { + "model": "bonemarrowref", + "organ_level": "Bone_marrow_L2", + "prediction_column": "celltype.l2" + }, + "UBERON:0001013": { + "model": "adiposeref", + "organ_level": "", + "prediction_column": "celltype.l2" + } +} \ No newline at end of file diff --git a/containers/celltypist/context/main.py b/containers/celltypist/context/main.py index b72b4e9..0498deb 100644 --- a/containers/celltypist/context/main.py +++ b/containers/celltypist/context/main.py @@ -7,39 +7,38 @@ import pandas import scanpy -from src.algorithm import Algorithm, OrganLookup, add_common_arguments +from src.algorithm import Algorithm, RunResult, add_common_arguments -class CelltypistOptions(t.TypedDict): - ensemble_lookup: Path - +class CelltypistOrganMetadata(t.TypedDict): + model: str -class CelltypistOrganLookup(OrganLookup[celltypist.Model]): - def __init__(self, mapping_file: Path): - super().__init__(mapping_file) - def get_builtin_options(self): - """Get builtin celltypist models.""" - models = celltypist.models.get_all_models() - return map(lambda model: (model, self.from_raw(model)), models) - - def from_raw(self, id: str): - """Load a celltypist model.""" - return celltypist.models.Model.load(id) +class CelltypistOptions(t.TypedDict): + ensemble_lookup: Path -class CelltypistAlgorithm(Algorithm[celltypist.Model, CelltypistOptions]): +class CelltypistAlgorithm(Algorithm[CelltypistOrganMetadata, CelltypistOptions]): def __init__(self): - super().__init__(CelltypistOrganLookup, "predicted_labels") - - def do_run(self, matrix: Path, organ: celltypist.Model, options: CelltypistOptions): + super().__init__("predicted_labels") + + def do_run( + self, + matrix: Path, + organ: str, + metadata: CelltypistOrganMetadata, + options: CelltypistOptions, + ) -> RunResult: """Annotate data using celltypist.""" data = scanpy.read_h5ad(matrix) data = self.normalize(data) data, var_names = self.normalize_var_names(data, options) - data = celltypist.annotate(data, organ, majority_voting=True).to_adata() + data = celltypist.annotate( + data, metadata["model"], majority_voting=True + ).to_adata() data.var_names = t.cast(t.Any, var_names) - return data + + return {"data": data, "organ_level": metadata["model"].replace(".", "_")} def normalize(self, data: scanpy.AnnData) -> scanpy.AnnData: """Normalizes data according to celltypist requirements. diff --git a/containers/celltypist/context/organ-mapping.json b/containers/celltypist/context/organ-mapping.json deleted file mode 100644 index d2acba8..0000000 --- a/containers/celltypist/context/organ-mapping.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "UBERON:0002048": "Human_Lung_Atlas.pkl", - "UBERON:0001004": "Human_Lung_Atlas.pkl", - "UBERON:0002097": "Adult_Human_Skin.pkl", - "UBERON:0001264": "Adult_Human_PancreaticIslet.pkl", - "UBERON:0000948": "Healthy_Adult_Heart.pkl", - "UBERON:0002107": "Healthy_Human_Liver.pkl", - "UBERON:0000955": "Human_AdultAged_Hippocampus.pkl" -} diff --git a/containers/celltypist/context/organ-metadata.json b/containers/celltypist/context/organ-metadata.json new file mode 100644 index 0000000..db07d6d --- /dev/null +++ b/containers/celltypist/context/organ-metadata.json @@ -0,0 +1,23 @@ +{ + "UBERON:0002048": { + "model": "Human_Lung_Atlas.pkl" + }, + "UBERON:0001004": { + "model": "Human_Lung_Atlas.pkl" + }, + "UBERON:0002097": { + "model": "Adult_Human_Skin.pkl" + }, + "UBERON:0001264": { + "model": "Adult_Human_PancreaticIslet.pkl" + }, + "UBERON:0000948": { + "model": "Healthy_Adult_Heart.pkl" + }, + "UBERON:0002107": { + "model": "Healthy_Human_Liver.pkl" + }, + "UBERON:0000955": { + "model": "Human_AdultAged_Hippocampus.pkl" + } +} \ No newline at end of file diff --git a/containers/crosswalking/context/main.py b/containers/crosswalking/context/main.py index 87a61b7..8990f22 100644 --- a/containers/crosswalking/context/main.py +++ b/containers/crosswalking/context/main.py @@ -8,8 +8,15 @@ import pandas as pd -def filter_crosswalk_table(table: pd.DataFrame, *columns: str) -> pd.DataFrame: - """Filter the crosswalk table to only include specified columns. +def filter_crosswalk_table( + table: pd.DataFrame, + organ_id: str, + organ_level: str, + organ_id_column: str, + organ_level_column: str, + table_label_column: str, +) -> pd.DataFrame: + """Filter the crosswalk table to only include rows with organ id and level. Also removes empty rows and cast values to string. @@ -19,7 +26,12 @@ def filter_crosswalk_table(table: pd.DataFrame, *columns: str) -> pd.DataFrame: Returns: pd.DataFrame: Filtered table """ - return table[list(columns)].dropna().astype(str).drop_duplicates() + organ_id_rows = table[organ_id_column].str.lower() == organ_id.lower() + organ_level_rows = table[organ_level_column].str.lower() == organ_level.lower() + filtered_table = table[organ_id_rows & organ_level_rows] + normalized_table = filtered_table.dropna().astype(str) + unique_table = normalized_table.drop_duplicates(table_label_column) + return unique_table def generate_iri(label: str) -> str: @@ -39,10 +51,14 @@ def generate_iri(label: str) -> str: def crosswalk( matrix: anndata.AnnData, + organ_id: str, + organ_level: str, data_label_column: str, data_clid_column: str, data_match_column: str, table: pd.DataFrame, + table_organ_id_column: str, + table_organ_level_column: str, table_label_column: str, table_clid_column: str, table_match_column: str, @@ -67,7 +83,12 @@ def crosswalk( table_match_column: data_match_column, } table = filter_crosswalk_table( - table, table_label_column, table_clid_column, table_match_column + table, + organ_id, + organ_level, + table_organ_id_column, + table_organ_level_column, + table_label_column, ) merged_obs = ( matrix.obs.merge( @@ -119,6 +140,8 @@ def _get_empty_table(args: argparse.Namespace) -> pd.DataFrame: """ return pd.DataFrame( columns=[ + args.crosswalk_table_organ_id_column, + args.crosswalk_table_organ_level_column, args.crosswalk_table_label_column, args.crosswalk_table_clid_column, args.crosswalk_table_match_column, @@ -137,7 +160,7 @@ def _read_table(path: str) -> t.Optional[pd.DataFrame]: """ with open(path) as file: for row in csv.reader(file): - if row[0].lower() == 'organ_level': + if row[0].lower() == "organ_level": return pd.read_csv(file, names=row) return None @@ -149,18 +172,26 @@ def main(args: argparse.Namespace): args (argparse.Namespace): CLI arguments, must contain "matrix", "annotation_column", "clid_column", "match_column", - "crosswalk_table", "crosswalk_table_label_column", + "crosswalk_table", "crosswalk_table_organ_id_column", + "crosswalk_table_organ_level_column", "crosswalk_table_label_column", "crosswalk_table_clid_column", "crosswalk_table_match_column", and "output_matrix" """ + metadata = args.matrix.uns["hra_crosswalking"] matrix = crosswalk( args.matrix, + metadata["organ_id"], + metadata["organ_level"], args.annotation_column, args.clid_column, args.match_column, - args.crosswalk_table - if args.crosswalk_table is not None - else _get_empty_table(args), + ( + args.crosswalk_table + if args.crosswalk_table is not None + else _get_empty_table(args) + ), + args.crosswalk_table_organ_id_column, + args.crosswalk_table_organ_level_column, args.crosswalk_table_label_column, args.crosswalk_table_clid_column, args.crosswalk_table_match_column, @@ -171,35 +202,45 @@ def main(args: argparse.Namespace): def _get_arg_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Add crosswalking to h5ad data") parser.add_argument("matrix", type=anndata.read_h5ad, help="h5ad data file") + parser.add_argument( + "--annotation-column", default="hra_prediction", help="Column with annotations" + ) + parser.add_argument( + "--clid-column", default="clid", help="Output column for cell ids" + ) + parser.add_argument( + "--match-column", default="match_type", help="Output column for match" + ) parser.add_argument( "--crosswalk-table", type=_read_table, help="crosswalking csv file path", ) + parser.add_argument( + "--crosswalk-table-organ-id-column", + default="Organ_ID", + help="Column with organ ids in crosswalking table", + ) + parser.add_argument( + "--crosswalk-table-organ-level-column", + default="Organ_Level", + help="Column with organ levels in crosswalking table", + ) parser.add_argument( "--crosswalk-table-label-column", - default="label", - help="Column with Azimuth label in crosswalking table", + default="Annotation_Label", + help="Column with label in crosswalking table", ) parser.add_argument( "--crosswalk-table-clid-column", - default="clid", + default="CL_ID", help="Column with CL ID in crosswalking table", ) parser.add_argument( "--crosswalk-table-match-column", - default="match", + default="CL_Match", help="Column with match type in crosswalking table", ) - parser.add_argument( - "--annotation-column", default="hra_prediction", help="Column with annotations" - ) - parser.add_argument( - "--clid-column", default="clid", help="Output column for cell ids" - ) - parser.add_argument( - "--match-column", default="match_type", help="Output column for match" - ) parser.add_argument( "--output-matrix", type=Path, diff --git a/containers/crosswalking/options.yml b/containers/crosswalking/options.yml index eff5d6c..99a3418 100644 --- a/containers/crosswalking/options.yml +++ b/containers/crosswalking/options.yml @@ -7,9 +7,19 @@ fields: label: Crosswalk CSV file inputBinding: prefix: --crosswalk-table + tableOrganIdColumn: + type: string? + label: Organ id column in crosswalk table + inputBinding: + prefix: --crosswalk-table-organ-id-column + tableOrganLevelColumn: + type: string? + label: Organ level column in crosswalk table + inputBinding: + prefix: --crosswalk-table-organ-level-column tableLabelColumn: type: string? - label: Azimuth label column in crosswalk table + label: Label column in crosswalk table inputBinding: prefix: --crosswalk-table-label-column tableClidColumn: diff --git a/containers/popv/context/download-ontology.sh b/containers/popv/context/download-ontology.sh index e503a71..dbbd609 100755 --- a/containers/popv/context/download-ontology.sh +++ b/containers/popv/context/download-ontology.sh @@ -1,7 +1,7 @@ #!/bin/bash ONTOLOGY_DIR=./ontology -ONTOLOGY_DIR_URL=https://raw.githubusercontent.com/YosefLab/PopV/main/ontology +ONTOLOGY_DIR_URL=https://raw.githubusercontent.com/YosefLab/PopV/main/resources/ontology mkdir -p $ONTOLOGY_DIR cd $ONTOLOGY_DIR diff --git a/containers/popv/context/main.py b/containers/popv/context/main.py index 6626f08..4acef4b 100644 --- a/containers/popv/context/main.py +++ b/containers/popv/context/main.py @@ -9,7 +9,12 @@ import scanpy import torch -from src.algorithm import Algorithm, OrganLookup, add_common_arguments +from src.algorithm import Algorithm, RunResult, add_common_arguments + + +class PopvOrganMetadata(t.TypedDict): + model: str + organ_level: str class PopvOptions(t.TypedDict): @@ -27,48 +32,55 @@ class PopvOptions(t.TypedDict): ensemble_lookup: Path -class PopvAlgorithm(Algorithm[str, PopvOptions]): +class PopvAlgorithm(Algorithm[PopvOrganMetadata, PopvOptions]): def __init__(self): - super().__init__(OrganLookup, "popv_prediction") + super().__init__("popv_prediction") - def do_run(self, matrix: Path, organ: str, options: PopvOptions): + def do_run( + self, + matrix: Path, + organ: str, + metadata: PopvOrganMetadata, + options: PopvOptions, + ) -> RunResult: """Annotate data using popv.""" data = scanpy.read_h5ad(matrix) - data = self.prepare_query(data, organ, options) + data = self.prepare_query(data, organ, metadata["model"], options) popv.annotation.annotate_data( data, # TODO: onclass has been removed due to error in fast mode # seen_result_key is not added to the result in fast mode but still expected during compute_consensus # https://github.com/YosefLab/PopV/blob/main/popv/annotation.py#L64 # https://github.com/YosefLab/PopV/blob/main/popv/algorithms/_onclass.py#L199 - # methods=["knn_on_scvi", "scanvi", "svm", "rf", "celltypist"], + # Also excludes celltypist since web requests are not available inside the docker container methods=[ "knn_on_scvi", "scanvi", "svm", "rf", - ], # excludes celltypist for some HTTPS bug + ], ) - return data + return {"data": data, "organ_level": metadata["organ_level"]} def prepare_query( - self, data: scanpy.AnnData, organ: str, options: PopvOptions + self, data: scanpy.AnnData, organ: str, model: str, options: PopvOptions ) -> scanpy.AnnData: """Prepares the data to be annotated by popv. Args: data (scanpy.AnnData): Unprepared data organ (str): Organ name + model (str): Model name options (PopvOptions): Additional options Returns: scanpy.AnnData: Prepared data """ reference_data_path = self.find_reference_data( - options["reference_data_dir"], organ + options["reference_data_dir"], organ, model ) - model_path = self.find_model_dir(options["models_dir"], organ) + model_path = self.find_model_dir(options["models_dir"], organ, model) reference_data = scanpy.read_h5ad(reference_data_path) n_samples_per_label = self.get_n_samples_per_label(reference_data, options) data = self.normalize_var_names(data, options) @@ -122,12 +134,13 @@ def get_n_samples_per_label( n_samples_per_label = numpy.max((n_samples_per_label, t.cast(int, n))) return n_samples_per_label - def find_reference_data(self, dir: Path, organ: str) -> Path: + def find_reference_data(self, dir: Path, organ: str, model: str) -> Path: """Finds the reference data directory for an organ. Args: dir (Path): Directory to search organ (str): Organ name + model (str): Model name Raises: ValueError: If no reference data could be found @@ -140,7 +153,7 @@ def is_reference_data_candidate(path: Path): return ( path.is_file() and path.suffix == ".h5ad" - and organ.lower() in path.stem.lower() + and model.lower() in path.stem.lower() ) return self._find_in_dir( @@ -150,12 +163,13 @@ def is_reference_data_candidate(path: Path): f"Multiple reference data candidates for organ '{organ}'", ) - def find_model_dir(self, dir: Path, organ: str) -> Path: + def find_model_dir(self, dir: Path, organ: str, model: str) -> Path: """Find the model data directory for an organ. Args: dir (Path): Directory to search organ (str): Organ name + model (str): Model name Raises: ValueError: If no model data could be found @@ -165,7 +179,7 @@ def find_model_dir(self, dir: Path, organ: str) -> Path: """ def is_model_candidate(path: Path): - return path.is_dir() and organ.lower() in path.name.lower() + return path.is_dir() and model.lower() in path.name.lower() return self._find_in_dir( dir, diff --git a/containers/popv/context/organ-mapping.json b/containers/popv/context/organ-mapping.json deleted file mode 100644 index f4015fe..0000000 --- a/containers/popv/context/organ-mapping.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "UBERON:0001255": "Bladder", - "UBERON:0000178": "Blood", - "UBERON:0002371": "Bone_Marrow", - "UBERON:0000970": "Eye", - "UBERON:0004548": "Eye", - "UBERON:0004549": "Eye", - "UBERON:0000948": "Heart", - "UBERON:0000059": "Large_Intestine", - "UBERON:0002107": "Liver", - "UBERON:0002048": "Lung", - "UBERON:0001004": "Lung", - "UBERON:0000029": "Lymph_Node", - "UBERON:0002509": "Lymph_Node", - "UBERON:0001911": "Mammary", - "UBERON:0001264": "Pancreas", - "UBERON:0002367": "Prostate", - "UBERON:0000079": "Prostate", - "UBERON:0002097": "Skin", - "UBERON:0002108": "Small_Intestine", - "UBERON:0002106": "Spleen", - "UBERON:0002370": "Thymus", - "UBERON:0003126": "Trachea", - "UBERON:0000995": "Uterus", - "UBERON:0004537": "Vasculature" -} \ No newline at end of file diff --git a/containers/popv/context/organ-metadata.json b/containers/popv/context/organ-metadata.json new file mode 100644 index 0000000..a171b86 --- /dev/null +++ b/containers/popv/context/organ-metadata.json @@ -0,0 +1,98 @@ +{ + "UBERON:0001255": { + "model": "Bladder", + "organ_level": "urinary bladder" + }, + "UBERON:0000178": { + "model": "Blood", + "organ_level": "blood" + }, + "UBERON:0002371": { + "model": "Bone_Marrow", + "organ_level": "bone marrow" + }, + "UBERON:0000970": { + "model": "Eye", + "organ_level": "eye" + }, + "UBERON:0004548": { + "model": "Eye", + "organ_level": "eye" + }, + "UBERON:0004549": { + "model": "Eye", + "organ_level": "eye" + }, + "UBERON:0000948": { + "model": "Heart", + "organ_level": "heart" + }, + "UBERON:0000059": { + "model": "Large_Intestine", + "organ_level": "large intestine" + }, + "UBERON:0002107": { + "model": "Liver", + "organ_level": "liver" + }, + "UBERON:0002048": { + "model": "Lung", + "organ_level": "lung" + }, + "UBERON:0001004": { + "model": "Lung", + "organ_level": "respiratory system" + }, + "UBERON:0000029": { + "model": "Lymph_Node", + "organ_level": "lymph node" + }, + "UBERON:0002509": { + "model": "Lymph_Node", + "organ_level": "mesenteric lymph node" + }, + "UBERON:0001911": { + "model": "Mammary", + "organ_level": "mammary gland" + }, + "UBERON:0001264": { + "model": "Pancreas", + "organ_level": "pancreas" + }, + "UBERON:0002367": { + "model": "Prostate", + "organ_level": "prostate gland" + }, + "UBERON:0000079": { + "model": "Prostate", + "organ_level": "male reproductive system" + }, + "UBERON:0002097": { + "model": "Skin", + "organ_level": "skin" + }, + "UBERON:0002108": { + "model": "Small_Intestine", + "organ_level": "small intestine" + }, + "UBERON:0002106": { + "model": "Spleen", + "organ_level": "spleen" + }, + "UBERON:0002370": { + "model": "Thymus", + "organ_level": "thymus" + }, + "UBERON:0003126": { + "model": "Trachea", + "organ_level": "trachea" + }, + "UBERON:0000995": { + "model": "Uterus", + "organ_level": "uterus" + }, + "UBERON:0004537": { + "model": "Vasculature", + "organ_level": "blood vasculature" + } +} \ No newline at end of file diff --git a/src/algorithm/__init__.py b/src/algorithm/__init__.py index 5dc88a1..0bae51c 100644 --- a/src/algorithm/__init__.py +++ b/src/algorithm/__init__.py @@ -1,6 +1,5 @@ -from .algorithm import Algorithm +from .algorithm import Algorithm, RunResult from .arguments import add_common_arguments -from .organ import OrganLookup from .report import AlgorithmReport -__all__ = ["Algorithm", "AlgorithmReport", "OrganLookup", "add_common_arguments"] +__all__ = ["Algorithm", "AlgorithmReport", "RunResult", "add_common_arguments"] diff --git a/src/algorithm/algorithm.py b/src/algorithm/algorithm.py index ef30c39..2074f33 100644 --- a/src/algorithm/algorithm.py +++ b/src/algorithm/algorithm.py @@ -1,43 +1,42 @@ import abc +import json import typing as t from pathlib import Path import anndata -from .organ import OrganLookup from .report import AlgorithmReport -Organ = t.TypeVar("Organ") -Options = t.TypeVar("Options") +OrganMetadata = t.TypeVar("OrganMetadata", bound=dict) +Options = t.TypeVar("Options", bound=dict) AnnDataOrPath = t.Union[str, Path, anndata.AnnData] -RunResult = t.Union[AnnDataOrPath, t.Tuple[AnnDataOrPath, str]] -class Algorithm(t.Generic[Organ, Options], abc.ABC): +class RunResult(t.TypedDict): + data: AnnDataOrPath + organ_level: str + prediction_column: t.Optional[str] + + +class Algorithm(t.Generic[OrganMetadata, Options], abc.ABC): """An annotation algorithm. Attributes: - organ_lookup (t.Callable[[Path], OrganLookup[Organ]]): Callable to create an organ lookup prediction_column (t.Optional[str]): Column in annotated data with the predictions """ - def __init__( - self, - organ_lookup: t.Callable[[Path], OrganLookup[Organ]], - prediction_column: t.Optional[str] = None, - ): - self.organ_lookup = organ_lookup + def __init__(self, prediction_column: t.Optional[str] = None): self.prediction_column = prediction_column def run( self, matrix: Path, organ: str, - organ_mapping: Path, + organ_metadata: Path, output_matrix: Path, output_annotations: Path, output_report: Path, - **options, + **options: Options, ) -> AlgorithmReport: """Runs the algorithm to annotate data. @@ -54,17 +53,19 @@ def run( """ report = AlgorithmReport(output_matrix, output_annotations, output_report) try: - lookup = self.organ_lookup(organ_mapping) - result = self.do_run(matrix, lookup.get(organ), t.cast(Options, options)) - result = self.__post_process_result(result) - report.set_success(result) + metadata = self.__load_metadata(organ, organ_metadata) + result = self.do_run(matrix, organ, metadata, t.cast(Options, options)) + data = self.__post_process_result(result, organ, metadata) + report.set_success(data) except Exception as error: report.set_failure(error) return report @abc.abstractmethod - def do_run(self, matrix: Path, organ: Organ, options: Options) -> RunResult: + def do_run( + self, matrix: Path, organ: str, metadata: OrganMetadata, options: Options + ) -> RunResult: """Perform a annotation run. Must be overridden in subclasses. Args: @@ -74,28 +75,56 @@ def do_run(self, matrix: Path, organ: Organ, options: Options) -> RunResult: Returns: RunResult: - Annotated data either in-memory or a path to a h5ad. - Can also return a tuple where the first element is - the annotated data and the second element is the name - of the column that stores the predictions. + Annotated data either in-memory or a path to a h5ad, + the organ level to use in crosswalking, and + optionally the column storing predictions """ ... - def __post_process_result(self, result: RunResult) -> anndata.AnnData: + def __load_metadata(self, organ: str, organ_metadata: Path) -> OrganMetadata: + """Loads metadata for an organ from file. + + Args: + organ (str): Organ id + organ_metadata (Path): Path to metadata file + + Returns: + OrganMetadata: Organ specific metadata + """ + with open(organ_metadata) as file: + data = json.load(file) + return data[organ] + + def __post_process_result( + self, result: RunResult, organ: str, metadata: OrganMetadata + ) -> anndata.AnnData: """Normalize the result of a run. Args: - result (RunResult): Non-normalized result value + result (RunResult): Run result dictionary Returns: anndata.AnnData: Loaded h5ad data """ - prediction_column = self.prediction_column - if isinstance(result, tuple): - result, prediction_column = result - if isinstance(result, (str, Path)): - result = anndata.read_h5ad(result) - if prediction_column is not None and prediction_column in result.obs.columns: - result.obs["hra_prediction"] = result.obs[prediction_column] - - return result + + data = result["data"] + if isinstance(data, (str, Path)): + data = anndata.read_h5ad(data) + + prediction_column = result.get("prediction_column", self.prediction_column) + if prediction_column is None: + raise ValueError("Missing prediction column") + elif prediction_column not in data.obs.columns: + raise ValueError( + "Prediction column does not exist in the result", prediction_column + ) + else: + data.obs["hra_prediction"] = data.obs[prediction_column] + + data.uns["hra_organ_metadata"] = metadata + data.uns["hra_crosswalking"] = { + "organ_id": organ, + "organ_level": result["organ_level"], + } + + return data diff --git a/src/algorithm/arguments.py b/src/algorithm/arguments.py index 915a316..34562c6 100644 --- a/src/algorithm/arguments.py +++ b/src/algorithm/arguments.py @@ -20,10 +20,10 @@ def add_common_arguments( parser.add_argument("matrix", type=Path, help="h5ad data file") parser.add_argument("--organ", required=True, help="Organ uberon id") parser.add_argument( - "--organ-mapping", + "--organ-metadata", type=Path, - default="/organ-mapping.json", - help="Organ mapping file", + default="/organ-metadata.json", + help="Organ metadata file", ) parser.add_argument( "--output-matrix", diff --git a/src/algorithm/organ.py b/src/algorithm/organ.py deleted file mode 100644 index b8417e4..0000000 --- a/src/algorithm/organ.py +++ /dev/null @@ -1,80 +0,0 @@ -import dataclasses -import json -import logging -import typing as t -from pathlib import Path - -Organ = t.TypeVar("Organ") - - -@dataclasses.dataclass -class OrganLookup(t.Generic[Organ]): - """Lookup from raw organ name to algorithm specific organ data. - - Attributes: - mapping_file (Path): Path to file mapping raw organ name to data - """ - - mapping_file: Path - - def get(self, id: str) -> Organ: - """Get the algorithm specific data for a raw organ name. - - Args: - id (str): Organ uberon id - - Raises: - ValueError: If the organ is not supported by the algorithm - - Returns: - Organ: Algorithm specific data - """ - for key, organ in self.__get_options(): - if key.lower() == id.lower(): - return organ - raise ValueError(f"Organ '{id}' is not supported") - - def get_builtin_options(self) -> t.Iterable[t.Tuple[str, Organ]]: - """Get builtin organ mapping options. - - Returns: - t.Iterable[t.Tuple[str, Organ]]: Entries mapping organ to data - """ - return [] - - def from_raw(self, raw: t.Any) -> Organ: - """Convert a raw mapping value to algorithm specific data. - Can be overridden in subclasses. - - Args: - raw (t.Any): Raw value from the mapping file - - Returns: - Organ: Converted organ data - """ - return raw - - def __get_options(self) -> t.Iterable[t.Tuple[str, Organ]]: - """Gets all options, builtin and from the mapping file. - - Yields: - Iterator[t.Tuple[str, Organ]]: Each entry from builtin and the mapping file - """ - yield from self.get_builtin_options() - try: - for key, value in self.__load_mapping_file(): - yield key, self.from_raw(value) - except ValueError: - logging.warn(f"Invalid format of organ mapping file '{self.mapping_file}'") - - def __load_mapping_file(self) -> t.Iterable[t.Tuple[str, t.Any]]: - """Load the mapping json file. - - Yields: - Iterator[t.Tuple[str, t.Any]]: Each entry in the mapping - """ - if not self.mapping_file.exists() or not self.mapping_file.is_file(): - return - with open(self.mapping_file) as file: - data = json.load(file) - yield from data.items() if isinstance(data, dict) else data diff --git a/steps/check_annotation_report.cwl b/steps/check_annotation_report.cwl index f226ef6..63c54bd 100644 --- a/steps/check_annotation_report.cwl +++ b/steps/check_annotation_report.cwl @@ -16,6 +16,6 @@ outputs: expression: | ${ - var isSuccess = /success/g.test(inputs.report.contents); + var isSuccess = /"status": "success"/g.test(inputs.report.contents); return { matrix_or_null: isSuccess ? inputs.matrix : null }; }