Register files from Census release 2023-12-06¶

import lamindb as ln
import lnschema_bionty as lb
import pandas as pd
2023-12-13 12:03:04,521:INFO - NumExpr defaulting to 2 threads.
💡 lamindb instance: laminlabs/cellxgene-latest
# Census release being registered; it selects the S3 prefix below and is reused
# later in this notebook as the dataset version and as an artifact key filter.
census_version = "2023-12-06"
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
# Show what is stored under the prefix before registering anything.
ln.UPath(s3path).view_tree()
h5ads (0 sub-directories & 1139 files with suffixes '.h5ad'): 
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
...
ln.context.track()
💡 notebook imports: lamindb==0.64.0 lnschema_bionty==0.36.1 pandas==1.4.4 requests==2.31.0
💡 loaded: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-12-06', short_name='census-release-2023-12-06', version='0', type='notebook', updated_at=2023-12-11 15:39:44 UTC, created_by_id=2)
💡 loaded: Run(uid='yq2FEOYiiNwTV6HJRReE', run_at=2023-12-13 12:03:11 UTC, transform_id=1, created_by_id=2)

Register artifacts (files)¶

# Create artifact records from the S3 prefix (presumably one per file found
# there -- confirm against the lamindb docs) and save them in bulk.
artifacts = ln.Artifact.from_dir(s3path)
ln.save(artifacts)
# Group the artifacts of this release into one dataset versioned by the
# Census release date.
dataset = ln.Dataset(artifacts, name="cellxgene-census", version=census_version)
dataset.save()

Register metadata¶

Get all datasets and associated metadata using cellxgene REST API:

import requests


def get_datasets_df_from_cxg(timeout: float = 60.0):
    """Fetch the metadata of all datasets from the CELLxGENE curation REST API.

    Despite the ``_df`` in its name, this function does not build a DataFrame:
    it returns the decoded JSON body of the response unchanged.

    Args:
        timeout: Seconds to wait for the server (connect and read) before
            giving up. Without a timeout ``requests`` may block forever on a
            stalled connection.

    Returns:
        The decoded JSON payload of ``GET /curation/v1/datasets``.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    api_url_base = "https://api.cellxgene.cziscience.com"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    res = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    res.raise_for_status()
    return res.json()
cellxgene_meta = get_datasets_df_from_cxg()
len(cellxgene_meta)
1152
cellxgene_meta[0].keys()
dict_keys(['assay', 'assets', 'batch_condition', 'cell_count', 'cell_type', 'citation', 'collection_doi', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'title', 'tombstone', 'x_approximate_distribution'])

Register new features and parent labels¶

# Maps each obs-level feature name to the registry string its labels come from.
obs_features = {
    "assay": "bionty.ExperimentalFactor",
    "cell_type": "bionty.CellType",
    "development_stage": "bionty.DevelopmentalStage",
    "disease": "bionty.Disease",
    "donor_id": "core.ULabel",
    "self_reported_ethnicity": "bionty.Ethnicity",
    "sex": "bionty.Phenotype",
    "suspension_type": "core.ULabel",
    "tissue": "bionty.Tissue",
    "tissue_type": "core.ULabel",
}

# One categorical Feature per entry, saved in bulk.
obs_features_records = [
    ln.Feature(name=feature_name, type="category", registries=feature_registry)
    for feature_name, feature_registry in obs_features.items()
]
ln.save(obs_features_records)

# Bundle the features into a feature set and attach it to every artifact
# under the "obs" slot.
obs_feature_set = ln.FeatureSet(name="obs features", features=obs_features_records)
obs_feature_set.save()
obs_feature_set.artifacts.set(artifacts, through_defaults={"slot": "obs"})
# Dataset-level features that are not part of the obs annotations.
ext_features = {"organism": "bionty.Organism", "collection": "core.ULabel"}

# One categorical Feature per entry, saved in bulk.
ext_features_records = [
    ln.Feature(name=feature_name, type="category", registries=feature_registry)
    for feature_name, feature_registry in ext_features.items()
]
ln.save(ext_features_records)

# Bundle the features into a feature set and attach it to every artifact
# under the "external" slot.
ext_feature_set = ln.FeatureSet(
    name="external features", features=ext_features_records
)
ext_feature_set.save()
ext_feature_set.artifacts.set(artifacts, through_defaults={"slot": "external"})
# Parent labels under which collections, donors, suspension types and tissue
# types are grouped.
for parent_name, parent_description in (
    ("is_collection", "parents of collections"),
    ("is_donor", "parents of donors"),
    ("is_suspension_type", "parents of suspension types"),
    ("is_tissue_type", "parents of tissue types"),
):
    ln.ULabel(name=parent_name, description=parent_description).save()

features = ln.Feature.lookup()
# Re-query the artifacts of this release as a queryset so that later cells
# can call .filter() on it.
artifacts = ln.File.filter(key__contains=census_version).all()

collections, organisms¶

# register all collections
is_collection = ln.ULabel.filter(name="is_collection").one()

# Many datasets share a collection: deduplicate on (name, doi, id).
collections_meta = {
    (
        dataset_meta["collection_name"],
        dataset_meta["collection_doi"],
        dataset_meta["collection_id"],
    )
    for dataset_meta in cellxgene_meta
}

# One ULabel per collection; the DOI goes into the description and the
# collection id into the reference field.
collections_records = [
    ln.ULabel(
        name=collection_name,
        description=collection_doi,
        reference=collection_id,
        reference_type="collection_id",
    )
    for collection_name, collection_doi, collection_id in collections_meta
]
ln.save(collections_records)
is_collection.children.add(*collections_records)
# register all organisms
ncbitaxon_source = lb.BiontySource.filter(source="ncbitaxon").one()

# Collect the distinct organism ontology ids used across all datasets.
organisms_meta = set()
for dataset_meta in cellxgene_meta:
    organisms_meta |= {i["ontology_term_id"] for i in dataset_meta["organism"]}

organisms_records = lb.Organism.from_values(
    organisms_meta,
    field=lb.Organism.ontology_id,
    bionty_source=ncbitaxon_source,
)
# rename house mouse to mouse
for house_mouse in (r for r in organisms_records if r.name == "house mouse"):
    house_mouse.name = "mouse"
ln.save(organisms_records, parents=False)

Link collections and organisms to artifacts:

ext_feature_set = ln.FeatureSet.filter(name="external features").one()
ext_features = ext_feature_set.members.lookup()
collections = is_collection.children.all()
organisms = lb.Organism.filter().all()

for dataset_meta in cellxgene_meta:
    # get registered file record based on dataset_id
    file = artifacts.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        # the API lists datasets that are not part of this Census release
        continue

    # register collection: the ULabel is looked up by the collection id
    # stored in its reference field
    collection_id = dataset_meta["collection_id"]
    collection = ln.ULabel.filter(reference=collection_id).one()
    file.labels.add(collection, feature=ext_features.collection)

    # register organism
    organism_ontology_ids = [
        organism["ontology_term_id"] for organism in dataset_meta["organism"]
    ]
    organism_records = lb.Organism.filter(ontology_id__in=organism_ontology_ids).list()
    file.labels.add(organism_records, feature=ext_features.organism)

ontologies¶

Register all ontology ids:

from typing import Optional
from lnschema_bionty.models import Registry
from lamindb.dev._feature_manager import get_accessor_by_orm

obs_feature_set = ln.FeatureSet.filter(name="obs features").one()
obs_features_records = obs_feature_set.members.lookup()
# Lookup from a feature's registries string to the name of the related
# accessor on ln.File.
ACCESSORS = get_accessor_by_orm(ln.File)
# feature name -> (accessor name on ln.File, registry class behind it)
FEATURE_TO_ACCESSOR = {}
for name in obs_features:
    feature = getattr(obs_features_records, name)
    accessor = ACCESSORS.get(feature.registries)
    relation_field = getattr(ln.File, accessor).field
    orm = relation_field.model
    # TODO: ulabels are defined in the File model, improve this in LaminDB
    if orm == ln.File:
        orm = relation_field.related_model
    FEATURE_TO_ACCESSOR[name] = (accessor, orm)


def create_ontology_record_from_source(
    ontology_id: str,
    from_orm: Registry,
    target_orm: Registry,
    bionty_source: Optional[lb.BiontySource] = None,
):
    """Build a ``target_orm`` record from a term looked up through ``from_orm``.

    This lets a term that belongs to another registry's ontology be
    registered under ``target_orm``: the term is fetched with
    ``from_orm.from_bionty`` and its name, description, ontology id and
    bionty source id are copied into a new ``target_orm`` record.

    Args:
        ontology_id: Ontology id of the term to look up.
        from_orm: Registry class used to look the term up.
        target_orm: Registry class of the record to create.
        bionty_source: Optional source passed to the ``from_bionty`` lookup.

    Returns:
        A new ``target_orm`` record (this function does not save it), or
        ``None`` if the term was not found or the record could not be built.
        Callers must be prepared to skip ``None``.
    """
    import logging

    logger = logging.getLogger(__name__)
    target_name = getattr(target_orm, "__name__", target_orm)

    from_record = from_orm.from_bionty(
        ontology_id=ontology_id, bionty_source=bionty_source
    )
    if from_record is None:
        logger.warning(
            "no source record found for %r; skipping %s record",
            ontology_id,
            target_name,
        )
        return None
    try:
        return target_orm(
            name=from_record.name,
            description=from_record.description,
            ontology_id=from_record.ontology_id,
            bionty_source_id=from_record.bionty_source_id,
        )
    except Exception as error:
        # Deliberately best-effort: one bad term must not abort a bulk
        # registration, but the failure should be visible.
        logger.warning(
            "could not create %s record for %r: %s", target_name, ontology_id, error
        )
        return None
obs_features.keys()
dict_keys(['assay', 'cell_type', 'development_stage', 'disease', 'donor_id', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'tissue_type'])
# NOTE(review): presumably disables the name search lamindb performs when a
# record is created -- confirm against the lamindb settings docs.
ln.settings.upon_create_search_names = False

# feature name -> set of (label, ontology_term_id) pairs seen in any dataset
ontology_ids = {}
for name in obs_features:
    # these features are handled as plain ULabels further down
    if name in ["donor_id", "suspension_type", "tissue_type"]:
        continue
    ontology_ids[name] = {
        (term["label"], term["ontology_term_id"])
        for dataset_meta in cellxgene_meta
        if name in dataset_meta
        for term in dataset_meta[name]
    }

bionty_source_ds_mouse = lb.BiontySource.filter(
    organism="mouse", entity="DevelopmentalStage"
).one()
bionty_source_pato = lb.BiontySource.filter(source="pato").one()

# register all ontology ids
for name, terms in ontology_ids.items():
    print(f"registering {name}")
    # only the registry class is needed here, not the accessor name
    orm = FEATURE_TO_ACCESSOR[name][1]
    terms_ids = [term_id for _label, term_id in terms]

    # First pass: everything the registry's default source can validate.
    records = orm.from_values(terms_ids, field="ontology_id")
    if records:
        ln.save(records)

    # Second pass: terms the first pass could not validate.
    inspect_result = orm.inspect(terms_ids, field="ontology_id", mute=True)
    non_validated = inspect_result.non_validated
    if len(non_validated) == 0:
        continue

    if name == "development_stage":
        # mouse stages come from the mouse DevelopmentalStage source
        records = orm.from_values(
            non_validated,
            field="ontology_id",
            bionty_source=bionty_source_ds_mouse,
        )
        # UBERON stage terms are looked up through the Tissue registry
        records += [
            create_ontology_record_from_source(
                ontology_id=term_id, from_orm=lb.Tissue, target_orm=orm
            )
            for term_id in non_validated
            if term_id.startswith("UBERON:")
        ]
        # "unknown" is not an ontology term: register it verbatim
        records += [
            orm(name=term_id, ontology_id=term_id)
            for term_id in non_validated
            if term_id == "unknown"
        ]
    else:
        # non-PATO leftovers are registered verbatim from the API's label/id
        records = [
            orm(name=label, ontology_id=term_id)
            for label, term_id in terms
            if (not term_id.startswith("PATO:")) and (term_id in non_validated)
        ]
        # PATO terms are looked up through the Phenotype registry
        records += [
            create_ontology_record_from_source(
                ontology_id=term_id,
                from_orm=lb.Phenotype,
                target_orm=orm,
                bionty_source=bionty_source_pato,
            )
            for term_id in non_validated
            if term_id.startswith("PATO:")
        ]

    # create_ontology_record_from_source returns None when it cannot build a
    # record; drop those so they are neither counted nor passed to ln.save
    records = [record for record in records if record is not None]
    if records:
        print(f"registered {len(records)} records: {records}")
        ln.save(records)
Hide code cell output
registering assay
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registering cell_type
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registering development_stage
❗ did not create DevelopmentalStage records for 57 non-validated ontology_ids: 'MmusDv:0000021', 'MmusDv:0000024', 'MmusDv:0000025', 'MmusDv:0000026', 'MmusDv:0000027', 'MmusDv:0000028', 'MmusDv:0000029', 'MmusDv:0000032', 'MmusDv:0000033', 'MmusDv:0000034', 'MmusDv:0000035', 'MmusDv:0000036', 'MmusDv:0000037', 'MmusDv:0000041', 'MmusDv:0000046', 'MmusDv:0000048', 'MmusDv:0000049', 'MmusDv:0000050', 'MmusDv:0000051', 'MmusDv:0000052', ...
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
❗ did not create DevelopmentalStage records for 6 non-validated ontology_ids: 'UBERON:0000113', 'UBERON:0007220', 'UBERON:0007222', 'UBERON:0018241', 'UBERON:0034919', 'unknown'
registered 57 records: [DevelopmentalStage(uid='oorzGtyN', name='24 weeks', ontology_id='MmusDv:0000074', description='5 Month-Old Stage That Refers To A Mouse Which Is Over 168 Days And Under 176 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='fiRoa3XX', name='22 weeks', ontology_id='MmusDv:0000072', description='5 Month-Old Stage That Refers To A Mouse Which Is Over 154 Days And Under 162 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='W2bOwIcG', name='18 month-old stage', ontology_id='MmusDv:0000089', description='Aged Adult Stage That Refers To A Mouse Which Is Over 18 And Under 19 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='jOuS6eot', name='Theiler stage 24', ontology_id='MmusDv:0000033', synonyms='E16|TS24', description='Fetal Stage During Which The Umbilical Hernia Disappears And There Is A Corresponding Increase In The Size Of The Peritoneal Sac.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='cGDi3WbC', name='8 weeks', ontology_id='MmusDv:0000052', description='2 Month-Old Stage That Refers To A Mouse Which Is Over 56 Days And Under 64 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='0QLIsnxe', name='4 month-old stage', ontology_id='MmusDv:0000064', description='Early Adult Stage That Refers To A Mouse Which Is Over 4 And Under 5 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='DgGRffIB', name='3 month-old stage', ontology_id='MmusDv:0000063', description='Early Adult Stage That Refers To A Mouse Which Is Over 3 And Under 4 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='BHHRJHHR', name='12 weeks', ontology_id='MmusDv:0000056', description='3 Month-Old Stage That Refers To A Mouse Which Is Over 84 Days And Under 92 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='b8YHi0Nc', name='11 weeks', ontology_id='MmusDv:0000055', description='2 Month-Old 
Stage That Refers To A Mouse Which Is Over 77 Days And Under 85 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='o9MDgZ6M', name='Theiler stage 19', ontology_id='MmusDv:0000026', synonyms='TS19|E11-12.25', description='Organogenesis Stage During Which The Lens Vesicle Becomes Completely Closed And Detached From The Ectoderm, And Peripheral Margins Of The Eye Become Well Defined.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='IZx9mYT1', name='Theiler stage 21', ontology_id='MmusDv:0000028', synonyms='E12.5-14|TS21', description='Organogenesis Stage During Which The Digit Widths And Locations Can Be Discerned, And The Pinna Rapidly Develops And Forms A Crest At Right Angles To The Head.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='jl2j0dxn', name='6 weeks', ontology_id='MmusDv:0000050', description='Early Stage That Refers To A Mouse Which Is Over 42 Days And Under 50 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Z0CP8fi0', name='4-7 days', ontology_id='MmusDv:0000113', description='Premature Stage That Refers To The Newborn Mouse Which Is Over 4 Days And Under 8 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='H0YqN4ng', name='9 weeks', ontology_id='MmusDv:0000053', description='2 Month-Old Stage That Refers To A Mouse Which Is Over 63 Days And Under 71 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='tJ45g0Sb', name='unknown', ontology_id='MmusDv:0000041', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='giDZt5Yz', name='19 weeks', ontology_id='MmusDv:0000068', description='4 Month-Old Stage That Refers To A Mouse Which Is Over 133 Days And Under 141 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='uDB0Ob7B', name='Theiler stage 28', ontology_id='MmusDv:0000037', description='Postnatal Development That Covers The Period After The Ts27 Stage (P0-P3 First Days Of Life) And Continuing 
To Adulthood. P4 To Adulthood.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='YF8WBL2Q', name='15 weeks', ontology_id='MmusDv:0000059', description='3 Month-Old Stage That Refers To A Mouse Which Is Over 105 Days And Under 113 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='8MuTFQO8', name='13 weeks', ontology_id='MmusDv:0000057', description='3 Month-Old Stage That Refers To A Mouse Which Is Over 91 Days And Under 99 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='BzdTRbz2', name='14 weeks', ontology_id='MmusDv:0000058', description='3 Month-Old Stage That Refers To A Mouse Which Is Over 98 Days And Under 106 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Y4EsPXsh', name='10 weeks', ontology_id='MmusDv:0000054', description='2 Month-Old Stage That Refers To A Mouse Which Is Over 70 Days And Under 78 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='msz7ilUG', name='26 weeks', ontology_id='MmusDv:0000099', description='6 Month-Old Stage That Refers To A Mouse Which Is Over 182 Days And Under 190 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Ni1mjjFg', name='mature stage', ontology_id='MmusDv:0000110', synonyms='mature', description='Mouse Developmental Stage That Refers To A Sexually Mature Adult Mouse Which Is Over 6 Weeks Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='8M0BbvJc', name='23 weeks', ontology_id='MmusDv:0000073', description='5 Month-Old Stage That Refers To A Mouse Which Is Over 161 Days And Under 169 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='df9sEkEI', name='early adult stage', ontology_id='MmusDv:0000061', description='Mature Stage That Refers To A Adult Mouse Which Is Over 6 Weeks And Under 7 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='qqSg01zz', name='20 weeks', ontology_id='MmusDv:0000070', description='4 
Month-Old Stage That Refers To A Mouse Which Is Over 140 Days And Under 148 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='ct1TGudU', name='Theiler stage 26', ontology_id='MmusDv:0000035', synonyms='TS26|E18', description='Fetal Stage Defined By Long Whiskers And During Which The Eyes Are Barely Visible Through The Closed Eyelids.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='vjWmEEv6', name='2 weeks', ontology_id='MmusDv:0000046', description='Premature Stage That Refers To A Mouse Which Is Over 14 Days And Under 22 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='U7yEA0lB', name='Theiler stage 27', ontology_id='MmusDv:0000036', description='Stage That Refers To The Newborn Mouse, Aged E19-20, P0. Description Of Anatomical Structures Related To This Stage Corresponds To The First Days Of The Mouse Life. Used For Postnatal Days 0 Through 3.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='TkE1VPQ6', name='7 weeks', ontology_id='MmusDv:0000051', description='Early Adult Stage That Refers To A Mouse Which Is Over 49 Days And Under 57 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='O7areJVw', name='5 month-old stage', ontology_id='MmusDv:0000069', description='Early Adult Stage That Refers To A Mouse Which Is Over 5 And Under 6 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='jym1WIjI', name='2 month-old stage', ontology_id='MmusDv:0000062', description='Early Adult Stage That Refers To A Mouse Which Is Over 2 And Under 3 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='PYPUpHe0', name='29 weeks', ontology_id='MmusDv:0000102', description='6 Month-Old Stage That Refers To A Mouse Which Is Over 203 Days And Under 211 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='loNJxGq0', name='Theiler stage 14', ontology_id='MmusDv:0000021', synonyms='TS14|E8.5-9.75', 
description='Organogenesis Stage During Which The Rostral Extremity Of The Neural Tube Closes In Embryos With Usually About 15-18 Somite Pairs. Late In The Stage The 3Rd Branchial Arch Becomes Visible.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='6N4RtxSR', name='Theiler stage 25', ontology_id='MmusDv:0000034', synonyms='TS25|E17', description='Fetal Stage During Which The Thickened Skin Forms Wrinkles And The Subcutaneous Veins Are Less Visible.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Zlsx8Dmt', name='18 weeks', ontology_id='MmusDv:0000067', description='4 Month-Old Stage That Refers To A Mouse Which Is Over 126 Days And Under 134 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='asteE00k', name='20 month-old stage and over', ontology_id='MmusDv:0000091', description='Aged Adult Stage That Refers To A Mouse Which Is Over 20 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='IFN01quy', name='21 weeks', ontology_id='MmusDv:0000071', description='5 Month-Old Stage That Refers To A Mouse Which Is Over 147 Days And Under 155 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='K7pFaQGM', name='25 weeks', ontology_id='MmusDv:0000098', description='6 Month-Old Stage That Refers To A Mouse Which Is Over 175 Days And Under 183 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='yOlwr0TC', name='4 weeks', ontology_id='MmusDv:0000048', description='Premature Stage That Refers To A Mouse Which Is Over 28 Days And Under 36 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='37VdjVb3', name='5 weeks', ontology_id='MmusDv:0000049', description='Premature Stage That Refers To A Mouse Which Is Over 35 Days And Under 43 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='070cWUIr', name='Theiler stage 22', ontology_id='MmusDv:0000029', synonyms='E13.5-15|TS22', description='Organogenesis Stage 
During Which The Fingers Are Clearly Visible And The Long Bones Of The Limbs Are Present.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='yWOXIv07', name='Theiler stage 17', ontology_id='MmusDv:0000024', synonyms='TS17|E10-11.25', description='Organogenesis Stage Defined By The Deepening Of The Lens Pit And The First Appearance Of The Physiological Umbilical Hernia.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Xsadx8yg', name='8 month-old stage', ontology_id='MmusDv:0000079', description='Middle Aged Stage That Refers To A Mouse Which Is Over 8 And Under 9 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='wlyIb2NL', name='16 weeks', ontology_id='MmusDv:0000065', description='4 Month-Old Stage That Refers To A Mouse Which Is Over 112 Days And Under 120 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='VEhMtYDB', name='Theiler stage 18', ontology_id='MmusDv:0000025', synonyms='E10.5-11.25|TS18', description='Organogenesis Stage During Which The Lens Vesicle Gradually Closes And The Nasal Pits Start To Form. 
The Rapid Growth Of The Brain Is Striking.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='QWeKuumK', name='Theiler stage 23', ontology_id='MmusDv:0000032', synonyms='E15|TS23', description='Fetal Stage During Which The Toes Separate And Hair Follicles Are Present In The Cephalic Region But Not At The Periphery Of The Vibrissae.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='p538b004', name='Theiler stage 20', ontology_id='MmusDv:0000027', synonyms='E11.5-13|TS20', description='Organogenesis Stage During Which The Handplate Develops Angles Corresponding To The Future Digits, And Tongue And Brain Vesicles Are Clearly Visible.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='HUrILEBs', name='16 month-old stage', ontology_id='MmusDv:0000087', description='Aged Adult Stage That Refers To A Mouse Which Is Over 16 And Under 17 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='PzAvRdXi', name='6 month-old stage', ontology_id='MmusDv:0000077', description='Early Adult Stage That Refers To A Mouse Which Is Over 6 And Under 7 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Dx800l8t', name='17 weeks', ontology_id='MmusDv:0000066', description='4 Month-Old Stage That Refers To A Mouse Which Is Over 119 Days And Under 127 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='lNh8U4YZ', name='late embryonic stage', ontology_id='UBERON:0007220', description='An Embryo Stage That Covers Late Steps Of The Embryogenesis With A Fully Formed Embryo Still Developing Before Birth Or Egg Hatching.', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='wksJWjer', name='prime adult stage', ontology_id='UBERON:0018241', description='A Life Cycle Stage That Starts At Completion Of Development And Growth Of The Sexually Mature Adult Animal, And Ends Before Senescence.', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='hqyIKjfF', name='late 
adult stage', ontology_id='UBERON:0007222', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='l00DTC4g', name='juvenile stage', ontology_id='UBERON:0034919', description='The Stage Of Being No More Dependent Of The Nest And/Or From Caregivers For Subsistence While Having Not Reach Sexual Maturity.', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='GDaE3j6Z', name='post-juvenile adult stage', ontology_id='UBERON:0000113', description='The Stage Of Being A Sexually Mature Adult Animal.', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=2)]
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registering disease
❗ did not create Disease record for 1 non-validated ontology_id: 'PATO:0000461'
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registered 1 records: [Disease(uid='4r2nqggf', name='normal', ontology_id='PATO:0000461', description='A Quality Inhering In A Bearer By Virtue Of The Bearer'S Exhibiting No Deviation From Normal Or Average.', bionty_source_id=43, created_by_id=2)]
registering self_reported_ethnicity
❗ did not create Ethnicity records for 3 non-validated ontology_ids: 'multiethnic', 'na', 'unknown'
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registered 3 records: [Ethnicity(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=2), Ethnicity(uid='UY1fNAFT', name='na', ontology_id='na', created_by_id=2), Ethnicity(uid='8lAgy5Ej', name='multiethnic', ontology_id='multiethnic', created_by_id=2)]
registering sex
❗ did not create Phenotype records for 3 non-validated ontology_ids: 'PATO:0000383', 'unknown', 'PATO:0000384'
registered 3 records: [Phenotype(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=2), Phenotype(uid='hSl0sSF0', name='female', ontology_id='PATO:0000383', description='A Biological Sex Quality Inhering In An Individual Or A Population That Only Produces Gametes That Can Be Fertilised By Male Gametes.', bionty_source_id=43, created_by_id=2), Phenotype(uid='Pl1UiuS0', name='male', ontology_id='PATO:0000384', description='A Biological Sex Quality Inhering In An Individual Or A Population Whose Sex Organs Contain Only Male Gametes.', bionty_source_id=43, created_by_id=2)]
registering tissue
❗ did not create Tissue records for 17 non-validated ontology_ids: 'CL:0000010 (cell culture)', 'CL:0000082 (cell culture)', 'CL:0000084 (cell culture)', 'CL:0000115 (cell culture)', 'CL:0000351 (cell culture)', 'CL:0002322 (cell culture)', 'CL:0002327 (cell culture)', 'CL:0002328 (cell culture)', 'CL:0002334 (cell culture)', 'CL:0002335 (cell culture)', 'CL:0002633 (cell culture)', 'CL:0010003 (cell culture)', 'UBERON:0000088 (organoid)', 'UBERON:0000966 (organoid)', 'UBERON:0001295 (organoid)', 'UBERON:0002048 (organoid)', 'UBERON:0002370 (organoid)'
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registered 17 records: [Tissue(uid='vg9s890t', name='respiratory basal cell (cell culture)', ontology_id='CL:0002633 (cell culture)', created_by_id=2), Tissue(uid='lfIFQFR5', name='epithelial cell of lung (cell culture)', ontology_id='CL:0000082 (cell culture)', created_by_id=2), Tissue(uid='rIPA0OEl', name='T cell (cell culture)', ontology_id='CL:0000084 (cell culture)', created_by_id=2), Tissue(uid='x3tRcugV', name='trophoblast (organoid)', ontology_id='UBERON:0000088 (organoid)', created_by_id=2), Tissue(uid='uS0Cw8zN', name='retina (organoid)', ontology_id='UBERON:0000966 (organoid)', created_by_id=2), Tissue(uid='kWD0kb5x', name='brown preadipocyte (cell culture)', ontology_id='CL:0002335 (cell culture)', created_by_id=2), Tissue(uid='UoElNxsj', name='endothelial cell (cell culture)', ontology_id='CL:0000115 (cell culture)', created_by_id=2), Tissue(uid='7MzqN14b', name='bronchial epithelial cell (cell culture)', ontology_id='CL:0002328 (cell culture)', created_by_id=2), Tissue(uid='RkE6D8y1', name='endometrium (organoid)', ontology_id='UBERON:0001295 (organoid)', created_by_id=2), Tissue(uid='yPk6E1V8', name='epithelial cell of alveolus of lung (cell culture)', ontology_id='CL:0010003 (cell culture)', created_by_id=2), Tissue(uid='K4RSNRBc', name='thymus (organoid)', ontology_id='UBERON:0002370 (organoid)', created_by_id=2), Tissue(uid='9ICArUMH', name='embryonic stem cell (cell culture)', ontology_id='CL:0002322 (cell culture)', created_by_id=2), Tissue(uid='WSs6UA9e', name='lung (organoid)', ontology_id='UBERON:0002048 (organoid)', created_by_id=2), Tissue(uid='w6gzNa8D', name='mammary gland epithelial cell (cell culture)', ontology_id='CL:0002327 (cell culture)', created_by_id=2), Tissue(uid='Ash8pGf8', name='trophoblast cell (cell culture)', ontology_id='CL:0000351 (cell culture)', created_by_id=2), Tissue(uid='CevFMDqD', name='preadipocyte (cell culture)', ontology_id='CL:0002334 (cell culture)', created_by_id=2), Tissue(uid='9YB5clqY', name='cultured 
cell (cell culture)', ontology_id='CL:0000010 (cell culture)', created_by_id=2)]

donors and suspension_types¶

# Collect the distinct donor ids and suspension types across all datasets;
# a dataset that lacks one of the keys contributes nothing for it.
donor_ids = set()
suspension_types = set()
for dataset_entry in cellxgene_meta:
    donor_ids.update(dataset_entry.get("donor_id", ()))
    suspension_types.update(dataset_entry.get("suspension_type", ()))

# Register donors that are not yet children of the "is_donor" parent label.
is_donor = ln.ULabel.filter(name="is_donor").one()
donors = is_donor.children.all()
result = donors.inspect(donor_ids, mute=True)
new_donors = [ln.ULabel(name=donor_name) for donor_name in result.non_validated]
ln.save(new_donors)
is_donor.children.add(*new_donors)

# Same for suspension types under the "is_suspension_type" parent label.
is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes = is_suspension_type.children.all()
result = stypes.inspect(suspension_types, mute=True)
new_stypes = [ln.ULabel(name=stype_name) for stype_name in result.non_validated]
ln.save(new_stypes)
is_suspension_type.children.add(*new_stypes)

Annotate artifacts with obs metadata¶

FEATURE_TO_ACCESSOR
{'assay': ('experimental_factors', lnschema_bionty.models.ExperimentalFactor),
 'cell_type': ('cell_types', lnschema_bionty.models.CellType),
 'development_stage': ('developmental_stages',
  lnschema_bionty.models.DevelopmentalStage),
 'disease': ('diseases', lnschema_bionty.models.Disease),
 'donor_id': ('ulabels', lnschema_core.models.ULabel),
 'self_reported_ethnicity': ('ethnicities', lnschema_bionty.models.Ethnicity),
 'sex': ('phenotypes', lnschema_bionty.models.Phenotype),
 'suspension_type': ('ulabels', lnschema_core.models.ULabel),
 'tissue': ('tissues', lnschema_bionty.models.Tissue),
 'tissue_type': ('ulabels', lnschema_core.models.ULabel)}
features = ln.Feature.lookup()

for idx, dataset_meta in enumerate(cellxgene_meta):
    # progress message every 100 datasets
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(cellxgene_meta)}")
    file = artifacts.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        # dataset is listed by the API but has no artifact in this release
        continue
    for field, terms in dataset_meta.items():
        if field not in FEATURE_TO_ACCESSOR:
            continue
        accessor, orm = FEATURE_TO_ACCESSOR[field]
        if field in ["donor_id", "suspension_type", "tissue_type"]:
            # ULabel-backed features: the values are plain names
            records = orm.from_values(terms, field="name")
            if records:
                # stratify by feature so that link tables records are written
                file.labels.add(records, feature=getattr(features, field))
            continue
        # ontology-backed features: link through the registry's own accessor
        term_ids = [term["ontology_term_id"] for term in terms]
        records = orm.from_values(term_ids, field="ontology_id")
        if records:
            getattr(file, accessor).add(*records)
Hide code cell output
annotating dataset 0 of 1152
annotating dataset 100 of 1152
annotating dataset 200 of 1152
annotating dataset 300 of 1152
annotating dataset 400 of 1152
annotating dataset 500 of 1152
annotating dataset 600 of 1152
annotating dataset 700 of 1152
annotating dataset 800 of 1152
annotating dataset 900 of 1152
annotating dataset 1000 of 1152
annotating dataset 1100 of 1152

Clean up the 2 “unknowns” in DevelopmentalStage:

# records named "unknown" whose ontology_id is something other than "unknown"
stale_unknown_stages = lb.DevelopmentalStage.filter(name="unknown").exclude(
    ontology_id="unknown"
)
stale_unknown_stages.delete()

Validate and register genes¶

# register synthetic constructs and sars_cov_2 as new organisms
new_organism_ids = ["NCBITaxon:32630", "NCBITaxon:2697049"]
new_organisms = lb.Organism.from_values(
    new_organism_ids,
    field=lb.Organism.ontology_id,
    bionty_source=ncbitaxon_source,
)
# NOTE(review): parents=False presumably skips registering parent terms; confirm in ln.save docs
ln.save(new_organisms, parents=False)

# genes files
# Organism lookup built on scientific_name; the gene-registration loop resolves each
# genes_files key against it with getattr(organisms, organism_name).
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)
# Gene tables (gzipped CSV) from the single-cell-curation repository, keyed by organism name.
_ONTOLOGY_FILES_URL = (
    "https://github.com/chanzuckerberg/single-cell-curation/raw/main"
    "/cellxgene_schema_cli/cellxgene_schema/ontology_files"
)
genes_files = {
    organism: f"{_ONTOLOGY_FILES_URL}/genes_{stem}.csv.gz"
    for organism, stem in (
        ("homo_sapiens", "homo_sapiens"),
        ("mus_musculus", "mus_musculus"),
        ("synthetic_construct", "ercc"),
        ("severe_acute_respiratory_syndrome_coronavirus_2", "sars_cov_2"),
    )
}

Register all genes for each organism:

# For each organism: register the genes listed in its CSV, then create one FeatureSet
# holding all of that organism's gene records.
for organism_name, genes_file in genes_files.items():
    print(f"registering {organism_name} genes")
    # the file has no header row; column 0 (passed below as ensembl_gene_id) becomes the index
    df = pd.read_csv(genes_file, header=None, index_col=0)
    organism_record = getattr(organisms, organism_name)
    gene_lookup = dict(field=lb.Gene.ensembl_gene_id, organism=organism_record)

    gene_records = lb.Gene.from_values(df.index, **gene_lookup)
    ln.save(gene_records)

    # register legacy genes manually: every ID that validate() does not flag as validated
    # gets a Gene record whose symbol is read from column 1 of the file
    validated = lb.Gene.validate(df.index, **gene_lookup)
    new_records = [
        lb.Gene(
            ensembl_gene_id=gene_id,
            symbol=df.loc[gene_id][1],
            organism=organism_record,
        )
        for gene_id in df.index[~validated]
    ]
    ln.save(new_records)

    all_gene_records = gene_records + new_records
    genes_feature_set = ln.FeatureSet(
        features=all_gene_records, name=f"all {organism_record.name} genes"
    )
    genes_feature_set.save()
Hide code cell output
registering homo_sapiens genes
❗ did not create Gene records for 147 non-validated ensembl_gene_ids: 'ENSG00000112096', 'ENSG00000137808', 'ENSG00000161149', 'ENSG00000182230', 'ENSG00000203812', 'ENSG00000204092', 'ENSG00000205485', 'ENSG00000212951', 'ENSG00000215271', 'ENSG00000221995', 'ENSG00000224739', 'ENSG00000224745', 'ENSG00000225178', 'ENSG00000225932', 'ENSG00000226377', 'ENSG00000226380', 'ENSG00000226403', 'ENSG00000227021', 'ENSG00000227220', 'ENSG00000227902', ...
❗ 147 terms (0.20%) are not validated for ensembl_gene_id: ENSG00000269933, ENSG00000261737, ENSG00000259834, ENSG00000256374, ENSG00000263464, ENSG00000203812, ENSG00000272196, ENSG00000272880, ENSG00000284299, ENSG00000270188, ENSG00000287116, ENSG00000237133, ENSG00000224739, ENSG00000227902, ENSG00000239467, ENSG00000272551, ENSG00000280374, ENSG00000284741, ENSG00000236886, ENSG00000229352, ...
registering mus_musculus genes
❗ did not create Gene records for 135 non-validated ensembl_gene_ids: 'ENSMUSG00000022591', 'ENSMUSG00000045506', 'ENSMUSG00000053706', 'ENSMUSG00000053861', 'ENSMUSG00000066378', 'ENSMUSG00000066810', 'ENSMUSG00000066936', 'ENSMUSG00000067085', 'ENSMUSG00000067122', 'ENSMUSG00000067292', 'ENSMUSG00000067627', 'ENSMUSG00000067929', 'ENSMUSG00000068181', 'ENSMUSG00000069518', 'ENSMUSG00000072693', 'ENSMUSG00000073290', 'ENSMUSG00000073291', 'ENSMUSG00000073682', 'ENSMUSG00000074210', 'ENSMUSG00000074302', ...
❗ 135 terms (0.20%) are not validated for ensembl_gene_id: ENSMUSG00000022591, ENSMUSG00000094127, ENSMUSG00000066936, ENSMUSG00000116275, ENSMUSG00000091312, ENSMUSG00000098794, ENSMUSG00000079353, ENSMUSG00000096240, ENSMUSG00000079286, ENSMUSG00000085431, ENSMUSG00000075015, ENSMUSG00000075014, ENSMUSG00000078091, ENSMUSG00000075006, ENSMUSG00000079175, ENSMUSG00000079171, ENSMUSG00000079170, ENSMUSG00000079169, ENSMUSG00000090353, ENSMUSG00000100963, ...
registering synthetic_construct genes
❗ loading non-default source inside a LaminDB instance
❗ no Bionty source found, skipping Bionty validation
❗ loading non-default source inside a LaminDB instance
❗ did not create Gene records for 92 non-validated ensembl_gene_ids: 'ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009', 'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016', 'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024', 'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033', 'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040', ...
❗ 92 terms (100.00%) are not validated for ensembl_gene_id: ERCC-00002, ERCC-00003, ERCC-00004, ERCC-00009, ERCC-00012, ERCC-00013, ERCC-00014, ERCC-00016, ERCC-00017, ERCC-00019, ERCC-00022, ERCC-00024, ERCC-00025, ERCC-00028, ERCC-00031, ERCC-00033, ERCC-00034, ERCC-00035, ERCC-00039, ERCC-00040, ...
registering severe_acute_respiratory_syndrome_coronavirus_2 genes
❗ loading non-default source inside a LaminDB instance
❗ no Bionty source found, skipping Bionty validation
❗ loading non-default source inside a LaminDB instance
❗ did not create Gene records for 12 non-validated ensembl_gene_ids: 'ENSSASG00005000002', 'ENSSASG00005000003', 'ENSSASG00005000004', 'ENSSASG00005000006', 'ENSSASG00005000010', 'ENSSASG00005000007', 'ENSSASG00005000011', 'ENSSASG00005000009', 'ENSSASG00005000012', 'ENSSASG00005000008', 'ENSSASG00005000005', 'ENSSASG00005000013'
❗ 12 terms (100.00%) are not validated for ensembl_gene_id: ENSSASG00005000002, ENSSASG00005000003, ENSSASG00005000004, ENSSASG00005000006, ENSSASG00005000010, ENSSASG00005000007, ENSSASG00005000011, ENSSASG00005000009, ENSSASG00005000012, ENSSASG00005000008, ENSSASG00005000005, ENSSASG00005000013

Annotate tissue_type¶

Before CxG schema 4.0, the tissue_type column was not annotated; instead, “cell culture” or “organoid” was appended to the record's ontology_id.

# Create one ULabel per tissue type and attach them as children of "is_tissue_type".
tissue_type_names = ("tissue", "organoid", "cell culture")
tissue_types = [ln.ULabel(name=type_name) for type_name in tissue_type_names]
ln.save(tissue_types)

is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
is_tissue_type.children.add(*tissue_types)
# lookups for the feature registry and for the children of "is_tissue_type"
features = ln.Feature.lookup()
is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
tissue_types = is_tissue_type.children.lookup()

# Tissue records whose ontology_id contains "organoid", shown as a table
organoids = lb.Tissue.filter(ontology_id__contains="organoid").all()
organoids.df()
uid name ontology_id abbr synonyms description bionty_source_id updated_at created_by_id
id
692 x3tRcugV trophoblast (organoid) UBERON:0000088 (organoid) None None None None 2023-12-11 19:11:55.832890+00:00 2
693 uS0Cw8zN retina (organoid) UBERON:0000966 (organoid) None None None None 2023-12-11 19:11:55.832921+00:00 2
697 RkE6D8y1 endometrium (organoid) UBERON:0001295 (organoid) None None None None 2023-12-11 19:11:55.833155+00:00 2
699 K4RSNRBc thymus (organoid) UBERON:0002370 (organoid) None None None None 2023-12-11 19:11:55.833223+00:00 2
701 WSs6UA9e lung (organoid) UBERON:0002048 (organoid) None None None None 2023-12-11 19:11:55.833293+00:00 2
# Migrate artifacts off the legacy "<ontology_id> (organoid)" Tissue records: each artifact
# is linked to the canonical tissue and labelled tissue_type = organoid.
for record in organoids:
    print(record.name)
    # legacy IDs look like "UBERON:0000088 (organoid)"; the text before the first space
    # is the plain ontology ID
    ontology_id = record.ontology_id.split(" ")[0]
    tissue_record = lb.Tissue.from_bionty(ontology_id=ontology_id)
    if tissue_record._state.adding:
        # the canonical tissue is not in the database yet
        tissue_record.save()
    # Iterate the artifacts of the legacy record, not of tissue_record: the canonical
    # tissue's artifacts are unrelated datasets that merely share the tissue and must not
    # be labelled as organoids.
    for f in record.artifacts.all():
        # keep the tissue annotation once the legacy record is deleted
        f.tissues.add(tissue_record)
        # pass the feature by keyword, as the obs-metadata annotation loop does
        f.labels.add(tissue_types.organoid, feature=features.tissue_type)
trophoblast (organoid)
retina (organoid)
endometrium (organoid)
thymus (organoid)
lung (organoid)
# delete the Tissue records selected by the "organoid" ontology_id filter
organoids.delete()
(10, {'lnschema_bionty.Tissue_files': 5, 'lnschema_bionty.Tissue': 5})
# Tissue records whose ontology_id contains "cell culture", shown as a table
cell_cultures = lb.Tissue.filter(ontology_id__contains="cell culture").all()
cell_cultures.df()
uid name ontology_id abbr synonyms description bionty_source_id updated_at created_by_id
id
691 rIPA0OEl T cell (cell culture) CL:0000084 (cell culture) None None None None 2023-12-11 19:11:55.832859+00:00 2
689 vg9s890t respiratory basal cell (cell culture) CL:0002633 (cell culture) None None None None 2023-12-11 19:11:55.832782+00:00 2
690 lfIFQFR5 epithelial cell of lung (cell culture) CL:0000082 (cell culture) None None None None 2023-12-11 19:11:55.832827+00:00 2
694 kWD0kb5x brown preadipocyte (cell culture) CL:0002335 (cell culture) None None None None 2023-12-11 19:11:55.833031+00:00 2
695 UoElNxsj endothelial cell (cell culture) CL:0000115 (cell culture) None None None None 2023-12-11 19:11:55.833064+00:00 2
696 7MzqN14b bronchial epithelial cell (cell culture) CL:0002328 (cell culture) None None None None 2023-12-11 19:11:55.833122+00:00 2
698 yPk6E1V8 epithelial cell of alveolus of lung (cell cult... CL:0010003 (cell culture) None None None None 2023-12-11 19:11:55.833189+00:00 2
700 9ICArUMH embryonic stem cell (cell culture) CL:0002322 (cell culture) None None None None 2023-12-11 19:11:55.833256+00:00 2
702 w6gzNa8D mammary gland epithelial cell (cell culture) CL:0002327 (cell culture) None None None None 2023-12-11 19:11:55.833349+00:00 2
703 Ash8pGf8 trophoblast cell (cell culture) CL:0000351 (cell culture) None None None None 2023-12-11 19:11:55.833382+00:00 2
704 CevFMDqD preadipocyte (cell culture) CL:0002334 (cell culture) None None None None 2023-12-11 19:11:55.833417+00:00 2
705 9YB5clqY cultured cell (cell culture) CL:0000010 (cell culture) None None None None 2023-12-11 19:11:55.833450+00:00 2
# Label the artifacts of the legacy "<ontology_id> (cell culture)" Tissue records with
# tissue_type = cell culture.
for record in cell_cultures:
    print(record.name)
    # legacy IDs look like "CL:0000084 (cell culture)"; the text before the first space
    # is the plain ontology ID
    ontology_id = record.ontology_id.split(" ")[0]
    # despite its name, tissue_record holds a CellType here (the plain IDs are CL terms)
    tissue_record = lb.CellType.from_bionty(ontology_id=ontology_id)
    if tissue_record._state.adding:
        # the cell type is not in the database yet
        tissue_record.save()
    # Iterate the artifacts of the legacy record, not of tissue_record: the cell type's
    # artifacts are every dataset containing that cell type, not just cell cultures.
    # NOTE(review): the artifacts are not linked to tissue_record here; decide which
    # accessor should carry the cell-culture term before the legacy records are deleted.
    for f in record.artifacts.all():
        # pass the feature by keyword, as the obs-metadata annotation loop does
        f.labels.add(tissue_types.cell_culture, feature=features.tissue_type)
T cell (cell culture)
respiratory basal cell (cell culture)
epithelial cell of lung (cell culture)
brown preadipocyte (cell culture)
endothelial cell (cell culture)
bronchial epithelial cell (cell culture)
epithelial cell of alveolus of lung (cell culture)
embryonic stem cell (cell culture)
mammary gland epithelial cell (cell culture)
trophoblast cell (cell culture)
preadipocyte (cell culture)
cultured cell (cell culture)
# delete the Tissue records selected by the "cell culture" ontology_id filter
cell_cultures.delete()
(0, {})

Register collections¶

# Create one Collection per child label of is_collection, built from the artifacts
# carrying that label. Labels without artifacts are skipped.
for i, ulabel in enumerate(is_collection.children.all()):
    if i % 20 == 0:
        print(i)
    # Use a dedicated name: rebinding the notebook-wide `artifacts` would break a re-run of
    # the annotation cell, which calls artifacts.filter(...) on the full set.
    collection_artifacts = ulabel.artifacts.all()
    # count once; each count() call is a separate database query
    n_artifacts = collection_artifacts.count()
    if n_artifacts == 0:
        continue
    collection = ln.Collection(
        # a single artifact is passed as the record itself rather than as a queryset
        collection_artifacts[0] if n_artifacts == 1 else collection_artifacts,
        name=ulabel.name,
        description=ulabel.description,
        reference=ulabel.reference,
        reference_type="CELLxGENE Collection ID",
    )
    collection.save()