1. 前期处理
同somatic,处理成bqsr.bam
2. 按样本运行HaplotypeCaller生产gvcf文件
ls ../align/*_bqsr.bam | perl -ne 'chomp; if ($_ =~ m|/([^/]+)_bqsr|) { print "gatk HaplotypeCaller -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --native-pair-hmm-threads 10 --intervals /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -I ../align/$1_bqsr.bam --dbsnp /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/Homo_sapiens_assembly38.dbsnp138.vcf --minimum-mapping-quality 30 -ERC GVCF -O $1.g.raw.vcf && echo $1 HaplotypeCaller ok\n"; }' > HaplotypeCaller.sh
3. 运行GenomicsDBImport/CombineGVCFs合并所有样本的所有组织的gvcf文件
echo -n "gatk CombineGVCFs -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --intervals /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed " > CombineGVCFs_All.sh && for file in *.g.raw.vcf; do echo -n "--variant $file " >> CombineGVCFs_All.sh; done && echo "-O FETB_All.g.vcf && echo All CombineGVCFs ok" >> CombineGVCFs_All.sh
4. 运行GenotypeGVCFs来call germline mutation
gatk GenotypeGVCFs -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --intervals /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V FETB_All.g.vcf -O FETB_All.g.jointgenotyping.vcf && echo FETB_All GenotypeGVCFs ok
5. 对vcf文件运行第一次VariantRecalibrator+ApplyVQSR来获取SNP
gatk VariantRecalibrator -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --intervals /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V FETB_All.g.jointgenotyping.vcf --resource:hapmap,known=false,training=true,truth=true,prior=15.0 /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/resources_broad_hg38_v0_hapmap_3.3.hg38.vcf.gz --resource:omni,known=false,training=true,truth=false,prior=12.0 /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/resources_broad_hg38_v0_1000G_omni2.5.hg38.vcf.gz --resource:1000G,known=false,training=true,truth=false,prior=10.0 /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/Homo_sapiens_assembly38.dbsnp138.vcf -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -mode SNP -O FETB_All.snp.recal -tranches-file FETB_All.snp.tranches --rscript-file FETB_All.snp.plots.R && echo FETB_All SNP VariantRecalibrator ok
gatk ApplyVQSR -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --intervals /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V FETB_All.g.jointgenotyping.vcf -O FETB_All.g.snp.vcf --truth-sensitivity-filter-level 99.0 -tranches-file FETB_All.snp.tranches --recal-file FETB_All.snp.recal -mode SNP && echo FETB_All ApplyVQSR ok
6. 对上一步的vcf文件运行第二次VariantRecalibrator+ApplyVQSR来获取INDEL
gatk VariantRecalibrator -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --intervals /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V FETB_All.g.snp.vcf --resource:mills,known=true,training=true,truth=true,prior=12.0 /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -mode INDEL --max-gaussians 6 -O FETB_All.snp.indel.recal -tranches-file FETB_All.snp.indel.tranches --rscript-file FETB_All.snp.indel.plots.R && echo FETB_All INDEL VariantRecalibrator ok
gatk ApplyVQSR -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --intervals /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V FETB_All.g.snp.vcf -O FETB_All.snp.indel.g.vcf --truth-sensitivity-filter-level 99.0 -tranches-file FETB_All.snp.indel.tranches --recal-file FETB_All.snp.indel.recal -mode INDEL && echo FETB_All ApplyVQSR INDEL ok
7. 筛选Filter为PASS的突变
bcftools view -f PASS "FETB_All.snp.indel.g.vcf" > "FETB_All.snp.indel.g_PASS.vcf"
8. 使用Annovar对最终vcf进行注释
table_annovar.pl FETB_All.snp.indel.g_PASS.vcf /data02/zhangmengmeng/database/annovar_db/humandb_hg38 -buildver hg38 -out FETB_All.snp.indel.g -remove -protocol refGene,cytoBand,avsnp151,cosmic70,exac03,clinvar_20240611 -operation g,r,f,f,f,f -nastring . -vcfinput
9. 筛选在所有样本中都发生了突变的germline mutation
python filter_germline.py -i FETB_All.snp.indel.g.hg38_multianno.vcf -o FETB_All.snp.indel.g.hg38_multianno_germline_filtered.vcf
filter_germline.py
import argparse
import pysam
def filter_germline_by_gt(input_vcf, output_vcf):
"""
Filters a VCF file to retain only germline variants based on GT field.
A variant is considered germline if all samples have a genotype that is
NOT '0/0' (homozygous reference) or './.' (missing).
:param input_vcf: Path to input VCF file.
:param output_vcf: Path to output filtered VCF file.
"""
# Open the input VCF file
vcf_in = pysam.VariantFile(input_vcf)
# Create an output VCF with the same header
vcf_out = pysam.VariantFile(output_vcf, "w", header=vcf_in.header)
for record in vcf_in:
# Extract genotype (GT) information for all samples
genotypes = [sample["GT"] for sample in record.samples.values()]
# Check if all samples are not '0/0' or './.'
if all(gt not in [(0, 0), (None, None)] for gt in genotypes):
vcf_out.write(record)
# Close files
vcf_in.close()
vcf_out.close()
print(f"Filtered VCF saved to: {output_vcf}")
def main():
# Set up argument parser
parser = argparse.ArgumentParser(description="Filter VCF file to retain only germline variants.")
parser.add_argument('-i', '--input', required=True, help="Input VCF file")
parser.add_argument('-o', '--output', required=True, help="Output filtered VCF file")
# Parse arguments
args = parser.parse_args()
# Call the filter function
filter_germline_by_gt(args.input, args.output)
if __name__ == '__main__':
main()