ARTICLE AD BOX
I created per-chromosome zarr stores from a multisample VCF file using scikit-allel in a python program launched by a SLURM job array. I did this in order to parallelize the work so it would complete in less time and require less memory. Can I now merge the resulting 14 zarr stores into a single zarr store by simply renaming them, putting them inside an enclosing directory, and adding a .zgroup file to it? This was the original program:
#!/usr/bin/env python3
"""Convert one chromosome of a multisample .vcf.gz into its own Zarr store.

Intended to run as one task of a SLURM job array: each task reads
SLURM_ARRAY_TASK_ID, looks up the corresponding sequence name from the
tabix index of the input VCF, and extracts just that region with
allel.vcf_to_zarr into "<input>.<chrom>.zarr" (group = chromosome name).
"""
import sys; print(sys.version)
import os
import glob
import subprocess
import numpy as np; print('numpy', np.__version__)
import pandas as pd; print('pandas', pd.__version__)
import allel; print('allel', allel.__version__)
import zarr; print('zarr', zarr.__version__)

# BUG FIX: the original did `INFN = sys.argv[1]` BEFORE checking it, so a
# missing argument raised IndexError and the friendly message never printed.
# Validate argv length first, then assign.
if len(sys.argv) < 2 or not sys.argv[1]:
    print('Must provide input .vcf.gz as first argument')
    sys.exit(2)
INFN = sys.argv[1]

# Fields to extract from the VCF. 'variants/ANN' holds snpEff annotations.
FIELDS = [
    'samples',
    'variants/CHROM',
    'variants/POS',
    'variants/REF',
    'variants/ALT',
    'variants/QUAL',
    'variants/TYPE',
    'variants/is_snp',
    'variants/numalt',
    'variants/AF',
    'variants/DP',
    'variants/ANN',
    'calldata/DP',
    'calldata/GT',
]
EXCLUDE_FIELDS = None

TABIX_EXEC = 'tabix'
# Report which tabix binary is in use so SLURM logs capture the environment.
print("Using tabix executable '{}' {} '{}'\n{}".format(TABIX_EXEC, "->",
      subprocess.check_output(['which', 'tabix']).decode('utf-8').rstrip(),
      subprocess.check_output([TABIX_EXEC, '--version']).decode('utf-8')))

# Default to task 0 so the script also runs standalone (outside a job array).
task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
# `tabix -l` lists the sequence names present in the index, one per line.
chroms = subprocess.check_output([TABIX_EXEC, '-l', INFN],
                                 universal_newlines=True).strip().split('\n')
# Guard against a job array sized larger than the number of sequences,
# which previously surfaced as a bare IndexError.
if task_id >= len(chroms):
    print(f"SLURM_ARRAY_TASK_ID {task_id} out of range: "
          f"index lists only {len(chroms)} sequences")
    sys.exit(3)
ch = chroms[task_id]
OUTFN = f"{INFN}.{ch}.zarr"

# BUG FIX: the original tested `'ANN' in FIELDS`, an exact-membership test
# that is always False because the list contains 'variants/ANN', so the
# ANNTransformer was silently never applied. Match the field suffix instead.
transformers = None
if any(f == 'ANN' or f.endswith('/ANN') for f in FIELDS):
    transformers = allel.ANNTransformer()


def vcf_to_zarr_func(chrom):
    """Extract region `chrom` from INFN into OUTFN under group `chrom`."""
    allel.vcf_to_zarr(INFN, OUTFN,
                      region=chrom,
                      group=chrom,
                      log=sys.stderr,
                      fields=FIELDS,
                      exclude_fields=EXCLUDE_FIELDS,
                      tabix=TABIX_EXEC,
                      transformers=transformers)


print(f"Processing chromosome: {ch}")
vcf_to_zarr_func(ch)