examples
Converting a .blocks.det file into a .hap file
You can use the data API to convert a PLINK 1.9 .blocks.det file into a .hap file.
As an example, let’s say we would like to convert the following simple.blocks.det file.
CHR BP1 BP2 KB NSNPS SNPS 1 10114 10117 2.001 2 1:10114:T:C|1:10116:A:G 1 10114 10119 2.007 2 1:10114:T:C|1:10117:C:A 1 10116 10119 2.011 2 1:10116:A:G|1:10117:C:A
from haptools import data
# load the genotypes file
# you can use either a VCF or PGEN file
gt = data.GenotypesVCF.load("tests/data/simple.vcf.gz")
gt = data.GenotypesPLINK.load("tests/data/simple.pgen")
# load the haplotypes
hp = data.Haplotypes("output.hap")
hp.data = {}
# iterate through lines of the .blocks.det file
with open("tests/data/simple.blocks.det") as blocks_file:
for idx, line in enumerate(blocks_file.read().splitlines()[1:]):
# initialize variables and parse line from the blocks file
hap_id = f"H{idx}"
chrom, bp1, bp2, kb, nsnps, snps = line.strip().split()
# create a haplotype line in the .hap file
hp.data[hap_id] = data.Haplotype(
chrom=chrom, start=int(bp1), end=int(bp2), id=hap_id
)
# fetch alleles from the genotypes file
snp_gts = gt.subset(variants=tuple(snps.split("|")))
# create variant lines for each haplotype
# Note that the .blocks.det file doesn't specify an allele, so
# we simply choose the first allele (ie the REF allele) for this example
hp.data[hap_id].variants = tuple(
data.Variant(
start=v["pos"],
end=v["pos"] + len(v["alleles"][0]),
id=v["id"],
allele=v["alleles"][0],
)
for v in snp_gts.variants
)
hp.write()
Converting a .snplist file into a .hap file
How would you convert a .snplist file into a .hap file suitable for use by simphenotype?
The basic idea is to encode each SNP as a haplotype containing only a single allele. For example, let’s say your .snplist file has two SNPs like this.
rs429358 0.73 rs7412 0.30
Then your .hap file might look something like this.
# orderH beta # version 0.2.0 #H beta .2f Effect size in linear model H 19 45411941 45411942 rs429358 0.73 H 19 45412079 45412080 rs7412 0.30 V rs429358 45411941 45411942 rs429358 C V rs7412 45412079 45412080 rs7412 T
You can use the data API and the simphenotype API to create such a file.
from haptools import data
from haptools.sim_phenotype import Haplotype
variants = {}
# load variants from the snplist file
with open("tests/data/apoe.snplist") as snplist_file:
for line in snplist_file.readlines():
# parse variant ID and beta from file
ID, beta = line.split("\t")
variants[ID] = float(beta)
# load the genotypes file
gt = data.GenotypesVCF("tests/data/apoe.vcf.gz")
gt.read(variants=variants.keys())
# initialize an empty haplotype file
hp = data.Haplotypes("output.hap", haplotype=Haplotype)
hp.data = {}
for variant in gt.variants:
ID, chrom, pos, alleles = variant[["id", "chrom", "pos", "alleles"]]
# we arbitrarily choose to use the ALT allele but alleles[0] will give you REF
allele = alleles[1]
end = pos + len(allele)
# create a haplotype line in the .hap file
hp.data[ID] = Haplotype(chrom=chrom, start=pos, end=end, id=ID, beta=variants[ID])
# create a variant line for each haplotype
hp.data[ID].variants = (data.Variant(start=pos, end=end, id=ID, allele=allele),)
hp.write()