构建 STAR & RSEM 参考基因组
# STAR GENCODE
STAR --runThreadN 6 --runMode genomeGenerate --genomeDir STAR_gencode --genomeFastaFiles gencode_GRCh38.p14.genome.fa --sjdbGTFfile gencode_v46.chr_patch_hapl_scaff.annotation.gtf
# RSEM GENCODE
cd RSEM_gencode
rsem-prepare-reference --gtf ../gencode_v46.chr_patch_hapl_scaff.annotation.gtf -p 10 /data02/zhangmengmeng/database/hg38/genecode_GRCh38.p14.genome.fa /data02/zhangmengmeng/database/hg38/RSEM_gencodeDB/ref_RSEM_genecode_GRCh38.p14
STAR 定量
cd STAR_align
ls ../rawdata/*.R1.clean.fastq.gz|perl -ne 'chomp;my $a=$_;$a=~s/\_R1/\_R2/;if($_=~/\/([^\/]+)\.R1/){print "STAR --runThreadN 8 --quantMode TranscriptomeSAM GeneCounts --genomeDir /data02/zhangmengmeng/database/hg38/STAR_gencode --readFilesIn ../rawdata/$1.R1.clean.fastq.gz ../rawdata/$1.R2.clean.fastq.gz --outSAMtype BAM SortedByCoordinate --outFileNamePrefix $1. --readFilesCommand zcat && echo $1 align ok\n"}' >aln.sh
RSEM 比对
cd RSEM
ls ../STAR_align/*Aligned.sortedByCoord.out.bam|perl -ne 'chomp;my $a=$_;$a=~s/\_R1/\_R2/;if($_=~/\/([^\/]+)\.Aligned/){print "rsem-calculate-expression --paired-end -no-bam-output --alignments -p 8 --append-names ../STAR_align/$1.Aligned.toTranscriptome.out.bam /data02/zhangmengmeng/database/hg38/RSEM_gencode/ref_RSEM_gencode_GRCh38.p14 $1 && echo $1 align ok\n"}' > RSEM.sh
合并 RSEM 结果
# RSEM_MergeResults.py
import pandas as pd
import os
import glob
RSEM_FileNames = glob.glob("./*.genes.results")
for tmp_FileName in RSEM_FileNames:
tmp_df = pd.read_csv(tmp_FileName, sep="\t", header=0)
SampleName = tmp_FileName.split(".genes.results")[0].replace("./", "")
tmp_df["gene_id"] = tmp_df["gene_id"].str.split("_").str[1]
tmp_count = tmp_df.groupby('gene_id', as_index=False)['expected_count'].sum()
tmp_count.columns = ['Gene',SampleName]
tmp_TPM = tmp_df.groupby('gene_id', as_index=False)['TPM'].sum()
tmp_TPM.columns = ['Gene',SampleName]
tmp_FPKM = tmp_df.groupby('gene_id', as_index=False)['FPKM'].sum()
tmp_FPKM.columns = ['Gene',SampleName]
if 'final_count' in locals():
final_count = pd.merge(final_count,tmp_count,how = 'outer',on = "Gene")
else:
final_count = tmp_count
if 'final_TPM' in locals():
final_TPM = pd.merge(final_TPM,tmp_count,how = 'outer',on = "Gene")
else:
final_TPM = tmp_TPM
if 'final_FPKM' in locals():
final_FPKM = pd.merge(final_FPKM,tmp_FPKM,how = 'outer',on = "Gene")
else:
final_FPKM = tmp_FPKM
final_count.to_csv('RSEM_count.txt',sep='\t',index=False)
final_TPM.to_csv('RSEM_TPM.txt',sep='\t',index=False)
final_FPKM.to_csv('RSEM_FPKM.txt',sep='\t',index=False)