安装
conda create -n biscut
conda activate biscut
# BISCUT 必须要用特定版本的R和Python
conda install conda-forge::r-base=4.1.2
conda install conda-forge::python=3.9.7
# 安装所需python包
pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install multiprocessing -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install numpy -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install operator -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install itertools -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install fnmatch -i https://pypi.tuna.tsinghua.edu.cn/simple/
# 安装所需R包
install.packages(c("ismev","extRemes","fitdistrplus","truncdist","segmented","dplyr","reticulate","foreach","doParallel","pastecs","ismev","ggplot2","fitdistrplus","gridExtra","stringr","gtable","cowplot","BiocManager"))
# 使用西湖大学镜像加速
options(BioC_mirror="https://mirrors.westlake.edu.cn/bioconductor")
BiocManager::install("GenomicRanges")
先运行BISCUT_preprocessing.py
# python内设置部分如下
# hg19和hg38都可以用SNP6_hg19_chromosome_locs_200605.txt作为位置文件来定义端粒和着丝粒的位置
# 注意seg文件需要放在python脚本下一个叫doc的文件夹内,其文件名格式为tumor_type+seg_file_suffix,如以下文件名为LiverOld_LIHC_merge_old_552.seg,注意tumor_type不能有下划线"_",否则后续会报错
amplitude_threshold = 0.2
chromosome_coordinates = '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
tumor_type = 'LiverOld'
seg_file_suffix = '_LIHC_merge_old_552.seg'
n_proc = 8
date_suffix = '2025_01_09'
构建hg38版本的基因名-cytoband对应文件
# 下载ucsc的hg38版本的cytoband文件:
# https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
# 下载ucsc的hg38版本的gtf文件:
# http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.refGene.gtf.gz
通过以下R脚本进行合并:
library(dplyr)
library(tidyr,lib.loc = "/usr/lib/R/site-library")
library(stringr,lib.loc = "/usr/lib/R/site-library")
library(IRanges)
UCSC_cytoband_file <- "~/database/GRCh38/ucsc/UCSC_cytoBand.txt"
UCSC_refgene_gtf <- "~/database/GRCh38/ucsc/UCSC_hg38.refGene.gtf"
UCSC_cytoband <- read.table(UCSC_cytoband_file, header = F, sep = '\t',quote = "")
UCSC_refgene <- read.table(UCSC_refgene_gtf, header = F, sep = '\t',quote = "",comment.char = "#")
# process refgene gtf
UCSC_refgene_split <- UCSC_refgene %>%
separate(V9, into = paste0("V9_", 1:5), sep = ";", fill = "right", extra = "drop")
UCSC_refgene_split <- subset(UCSC_refgene_split,V3=="transcript")[,c(1,4,5,9,10)]
colnames(UCSC_refgene_split) <- c("Chr","Start","End","Gene","RefSeqName")
UCSC_refgene_split <- UCSC_refgene_split %>%
mutate(Gene = gsub('.*"([^"]+)".*', '\\1', Gene))
UCSC_refgene_split <- UCSC_refgene_split %>%
mutate(RefSeqName = gsub('.*"([^"]+)".*', '\\1', RefSeqName))
UCSC_refgene_split <- subset(UCSC_refgene_split,Chr %in% c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"))
# process cytoband
UCSC_cytoband <- subset(UCSC_cytoband,V1 %in% c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"))
UCSC_cytoband <- UCSC_cytoband[,1:4]
colnames(UCSC_cytoband) <- c("Chr","Start","End","Cytoband")
# Add the Cytoband annotation to UCSC_refgene_split
UCSC_refgene_split$Cytoband <- NA
# Loop through each chromosome to match positions
for (chr in unique(UCSC_refgene_split$Chr)) {
# Filter rows for the current chromosome
refgene_chr <- UCSC_refgene_split %>% filter(Chr == chr)
cytoband_chr <- UCSC_cytoband %>% filter(Chr == chr)
# Create IRanges objects for the current chromosome
refgene_ranges <- IRanges(start = refgene_chr$Start, end = refgene_chr$End)
cytoband_ranges <- IRanges(start = cytoband_chr$Start, end = cytoband_chr$End)
# Find overlaps
overlaps <- findOverlaps(refgene_ranges, cytoband_ranges)
# Map Cytoband values back to UCSC_refgene_split
UCSC_refgene_split$Cytoband[UCSC_refgene_split$Chr == chr][queryHits(overlaps)] <-
cytoband_chr$Cytoband[subjectHits(overlaps)]
}
UCSC_refgene_split <- UCSC_refgene_split[,c("Chr","Start","End","Cytoband","Gene","RefSeqName")]
write.table(UCSC_refgene_split,"/home/zhoukaiwen/software/BISCUT-py3-main/docs/UCSC_hg38_geneloc.txt",col.names = T,row.names = F,sep = '\t',quote = F)
- 此后按需要可能要删除文件中Chr列的“chr”前缀,并提取unique的基因转录本
设定默认shell
在home目录首先构建tmp文件夹
mkdir /home/zhoukaiwen/tmp
在tmp文件夹中构建软链接,否则后续reticulate包可能会报错,因默认的/usr/bin/sh默认指向dash,与bash的语言格式不同
ln -s /bin/bash /home/zhoukaiwen/tmp/sh
修改R脚本
# 修改最后的files = list.files(path = paste(resultsfolder,'/stats',sep=''))为以下,否则会读取result文件夹下的summary文件夹名从而报错
files = list.files(path = paste(resultsfolder,'/stats',sep=''),pattern = "\\.rds$")
# 在BISCUT_peaks_finding.R 最后的source_python前添加以下代码来绕开R包reticulate中的Sys.which("sh")链接到“/usr/bin/sh”
Sys.setenv(PATH = paste("/home/zhoukaiwen/tmp", Sys.getenv("PATH"), sep = ":"))
Sys.which("sh")
BISCUT结果文件
BISCUT 结果文件有:
- 在设定的results文件夹下的all_BISCUT_results.txt,如果有多个肿瘤类型,这里是整合了所有类型的
- 在results文件夹下有以设定的肿瘤类型命名的文件夹,如上面设置的LiverOld,这个文件夹下又有一个summary文件夹,里面的LiverOld_BISCUT_results.txt和LiverOld_BISCUT_results_cols_0.95.txt为这个肿瘤类型单独的结果文件
通过BISCUT的结果文件计算peak和gene-level的Relative fitness
gc()
rm(list=ls())
# Settings ####
n_cores <- 8 # numbers of CPU cores for parallelization over chromosome arms
set.seed(123456789) #random seed
# Settings for Liver Old —— Self background
tumor_type <- "LiverOld"
biscut_peak_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95/LiverOld/summary/LiverOld_BISCUT_results_cols_0.95.txt"
biscut_gene_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95/LiverOld/summary/LiverOld_BISCUT_results.txt"
breakpoint_file_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/breakpoint_files_2025_01_09/LiverOld"
abslocs_file <- '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
output_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95"
# Settings for Liver Young —— Self background
tumor_type <- "LiverYoung"
biscut_peak_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95/LiverYoung/summary/LiverYoung_BISCUT_results_cols_0.95.txt"
biscut_gene_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95/LiverYoung/summary/LiverYoung_BISCUT_results.txt"
breakpoint_file_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/breakpoint_files_2025_01_09/LiverYoung"
abslocs_file <- '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
output_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95"
# Settings for Liver Old —— PANCAN background
tumor_type <- "LiverOld"
biscut_peak_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95/LiverOld/summary/LiverOld_BISCUT_results_cols_0.95.txt"
biscut_gene_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95/LiverOld/summary/LiverOld_BISCUT_results.txt"
breakpoint_file_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/breakpoint_files_2025_01_14/LiverOld"
abslocs_file <- '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
output_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95"
# Settings for Liver Young —— PANCAN background
tumor_type <- "LiverYoung"
biscut_peak_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95/LiverYoung/summary/LiverYoung_BISCUT_results_cols_0.95.txt"
biscut_gene_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95/LiverYoung/summary/LiverYoung_BISCUT_results.txt"
breakpoint_file_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/breakpoint_files_2025_01_14/LiverYoung"
abslocs_file <- '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
output_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95"
# Step0: Load Library ####
library(ismev)
library(extRemes)
library(fitdistrplus)
library(truncdist)
library(segmented)
library(dplyr)
library(reticulate)
library(foreach)
library(doParallel)
library(tidyr,lib.loc = "/usr/lib/R/site-library")
registerDoParallel(n_cores)
getDoParWorkers()
# Step0: Load telomere/centromere location files ####
message("Step0: Loading telomere/centromere location file...")
abslocs <- read.table(abslocs_file,sep='\t',header=T)
message("Step0: Load Done!")
# Step1: Process BISCUT identified peak file and obtain peak regions ####
message("Step1: Loading BISCUT identified peak files...")
biscut_peak <- read.csv(biscut_peak_file,sep='\t',header=T)
biscut_gene <- read.csv(biscut_gene_file,sep='\t',header=T)
message("Step1: Extracting BISCUT identified peaks...")
biscut_peak <- as.data.frame(t(biscut_peak))
colnames(biscut_peak) <- biscut_peak[1,]
biscut_peak <- biscut_peak[-1,]
biscut_peak_loc <- biscut_peak$peak_location
biscut_peak_loc <- data.frame(biscut_peak_loc) %>%
separate(biscut_peak_loc, into = c("Chr", "Positions"), sep = ":") %>%
separate(Positions, into = c("Start", "End"), sep = "-")
biscut_peak_loc$Chr <- gsub("chr","",biscut_peak_loc$Chr)
biscut_peak_loc$Chr <- as.integer(biscut_peak_loc$Chr)
biscut_peak_loc$direction <- biscut_peak$direction
biscut_peak_loc$telcent <- biscut_peak$`telomeric or centromeric`
biscut_peak_loc_abslocs <- biscut_peak_loc %>%
left_join(abslocs, by = c("Chr" = "chromosome_info"))
biscut_peak_loc_abslocs$Chr <- as.character(biscut_peak_loc_abslocs$Chr)
biscut_peak_loc_abslocs$ArmType <- "NA"
biscut_peak_loc_abslocs$Start <- as.integer(biscut_peak_loc_abslocs$Start)
biscut_peak_loc_abslocs$End <- as.integer(biscut_peak_loc_abslocs$End)
biscut_peak_loc_abslocs$ArmType[which(biscut_peak_loc_abslocs$Start > biscut_peak_loc_abslocs$p_start)] <- "p"
biscut_peak_loc_abslocs$ArmType[which(biscut_peak_loc_abslocs$Start > biscut_peak_loc_abslocs$q_start)] <- "q"
biscut_peak_loc_abslocs$ArmLength <- "NA"
biscut_peak_loc_abslocs$ArmLength[which(biscut_peak_loc_abslocs$ArmType=="p")] <- biscut_peak_loc_abslocs$p_end[which(biscut_peak_loc_abslocs$ArmType=="p")]-biscut_peak_loc_abslocs$p_start[which(biscut_peak_loc_abslocs$ArmType=="p")]
biscut_peak_loc_abslocs$ArmLength[which(biscut_peak_loc_abslocs$ArmType=="q")] <- biscut_peak_loc_abslocs$q_end[which(biscut_peak_loc_abslocs$ArmType=="q")]-biscut_peak_loc_abslocs$q_start[which(biscut_peak_loc_abslocs$ArmType=="q")]
biscut_peak_loc_abslocs$ArmLength <- as.integer(biscut_peak_loc_abslocs$ArmLength)
biscut_peak_loc_abslocs$ArmStart<- "NA"
biscut_peak_loc_abslocs$ArmStart[which(biscut_peak_loc_abslocs$ArmType=="p")] <- biscut_peak_loc_abslocs$p_start[which(biscut_peak_loc_abslocs$ArmType=="p")]
biscut_peak_loc_abslocs$ArmStart[which(biscut_peak_loc_abslocs$ArmType=="q")] <- biscut_peak_loc_abslocs$q_start[which(biscut_peak_loc_abslocs$ArmType=="q")]
biscut_peak_loc_abslocs$ArmStart <- as.integer(biscut_peak_loc_abslocs$ArmStart)
biscut_peak_loc_abslocs$Start_percent <- "NA"
biscut_peak_loc_abslocs$Start_percent <- (biscut_peak_loc_abslocs$Start-biscut_peak_loc_abslocs$ArmStart)/biscut_peak_loc_abslocs$ArmLength
biscut_peak_loc_abslocs$Start_percent[which(biscut_peak_loc_abslocs$Start_percent<0)] <- 0
biscut_peak_loc_abslocs$Start_percent[which(biscut_peak_loc_abslocs$Start_percent>1)] <- 1
biscut_peak_loc_abslocs$End_percent <- "NA"
biscut_peak_loc_abslocs$End_percent <- (biscut_peak_loc_abslocs$End-biscut_peak_loc_abslocs$ArmStart)/biscut_peak_loc_abslocs$ArmLength
biscut_peak_loc_abslocs$End_percent[which(biscut_peak_loc_abslocs$End_percent<0)] <- 0
biscut_peak_loc_abslocs$End_percent[which(biscut_peak_loc_abslocs$End_percent>1)] <- 1
biscut_peak_loc_abslocs$Peak_Length <- biscut_peak_loc_abslocs$End-biscut_peak_loc_abslocs$Start
biscut_peak_loc_abslocs$Peak_Length_percent <- biscut_peak_loc_abslocs$Peak_Length/biscut_peak_loc_abslocs$ArmLength
# Reverse the direction peaks of cent-bounded on p and tel-bounded on q to make them start on 0
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$Start_percent <- 1-(biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$Start_percent)
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$End_percent <- 1-(biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$End_percent)
tmp_peak_start <- biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$End_percent
tmp_peak_end <- biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$Start_percent
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$Start_percent <- tmp_peak_start
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$End_percent <- tmp_peak_end
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$Start_percent <- 1-(biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$Start_percent)
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$End_percent <- 1-(biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$End_percent)
tmp_peak_start <- biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$End_percent
tmp_peak_end <- biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$Start_percent
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$Start_percent <- tmp_peak_start
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$End_percent <- tmp_peak_end
biscut_peak_loc_abslocs$Chr <- as.integer(biscut_peak_loc_abslocs$Chr)
biscut_peak_loc_abslocs <- biscut_peak_loc_abslocs[order(biscut_peak_loc_abslocs$Chr),]
message("Step1: Extracting BISCUT identified genes on peaks...")
biscut_gene <- biscut_gene[,c("Chr","Gene","Start","End","Peak.Start","Peak.End","arm","direction","telcent")]
biscut_gene_loc_abslocs <- biscut_gene %>%
left_join(abslocs, by = c("Chr" = "chromosome_info"))
biscut_gene_loc_abslocs$ArmType <- "NA"
biscut_gene_loc_abslocs$Start <- as.integer(biscut_gene_loc_abslocs$Start)
biscut_gene_loc_abslocs$End <- as.integer(biscut_gene_loc_abslocs$End)
biscut_gene_loc_abslocs$ArmType[which(biscut_gene_loc_abslocs$Start > biscut_gene_loc_abslocs$p_start)] <- "p"
biscut_gene_loc_abslocs$ArmType[which(biscut_gene_loc_abslocs$Start > biscut_gene_loc_abslocs$q_start)] <- "q"
biscut_gene_loc_abslocs$ArmLength <- "NA"
biscut_gene_loc_abslocs$ArmLength[which(biscut_gene_loc_abslocs$ArmType=="p")] <- biscut_gene_loc_abslocs$p_end[which(biscut_gene_loc_abslocs$ArmType=="p")]-biscut_gene_loc_abslocs$p_start[which(biscut_gene_loc_abslocs$ArmType=="p")]
biscut_gene_loc_abslocs$ArmLength[which(biscut_gene_loc_abslocs$ArmType=="q")] <- biscut_gene_loc_abslocs$q_end[which(biscut_gene_loc_abslocs$ArmType=="q")]-biscut_gene_loc_abslocs$q_start[which(biscut_gene_loc_abslocs$ArmType=="q")]
biscut_gene_loc_abslocs$ArmLength <- as.integer(biscut_gene_loc_abslocs$ArmLength)
biscut_gene_loc_abslocs$ArmStart<- "NA"
biscut_gene_loc_abslocs$ArmStart[which(biscut_gene_loc_abslocs$ArmType=="p")] <- biscut_gene_loc_abslocs$p_start[which(biscut_gene_loc_abslocs$ArmType=="p")]
biscut_gene_loc_abslocs$ArmStart[which(biscut_gene_loc_abslocs$ArmType=="q")] <- biscut_gene_loc_abslocs$q_start[which(biscut_gene_loc_abslocs$ArmType=="q")]
biscut_gene_loc_abslocs$ArmStart <- as.integer(biscut_gene_loc_abslocs$ArmStart)
biscut_gene_loc_abslocs$Start_percent <- "NA"
biscut_gene_loc_abslocs$Start_percent <- (biscut_gene_loc_abslocs$Start-biscut_gene_loc_abslocs$ArmStart)/biscut_gene_loc_abslocs$ArmLength
biscut_gene_loc_abslocs$Start_percent[which(biscut_gene_loc_abslocs$Start_percent<0)] <- 0
biscut_gene_loc_abslocs$Start_percent[which(biscut_gene_loc_abslocs$Start_percent>1)] <- 1
biscut_gene_loc_abslocs$End_percent <- "NA"
biscut_gene_loc_abslocs$End_percent <- (biscut_gene_loc_abslocs$End-biscut_gene_loc_abslocs$ArmStart)/biscut_gene_loc_abslocs$ArmLength
biscut_gene_loc_abslocs$End_percent[which(biscut_gene_loc_abslocs$End_percent<0)] <- 0
biscut_gene_loc_abslocs$End_percent[which(biscut_gene_loc_abslocs$End_percent>1)] <- 1
biscut_gene_loc_abslocs$Gene_Length <- biscut_gene_loc_abslocs$End-biscut_gene_loc_abslocs$Start
biscut_gene_loc_abslocs$Gene_Length_percent <- biscut_gene_loc_abslocs$Gene_Length/biscut_gene_loc_abslocs$ArmLength
# Reverse the direction genes of cent-bounded on p and tel-bounded on q to make them start on 0
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$Start_percent <- 1-(biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$Start_percent)
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$End_percent <- 1-(biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$End_percent)
tmp_gene_start <- biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$End_percent
tmp_gene_end <-biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$Start_percent
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$Start_percent <- tmp_gene_start
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$End_percent <- tmp_gene_end
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$Start_percent <- 1-(biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$Start_percent)
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$End_percent <- 1-(biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$End_percent)
tmp_gene_start <- biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$End_percent
tmp_gene_end <-biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$Start_percent
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$Start_percent <- tmp_gene_start
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$End_percent <- tmp_gene_end
biscut_gene_loc_abslocs$Chr <- as.integer(biscut_gene_loc_abslocs$Chr)
biscut_gene_loc_abslocs <- biscut_gene_loc_abslocs[order(biscut_gene_loc_abslocs$Chr),]
message("Step1 All Done!")
# Step2: Process breakpoint file and count the number of true telomere/centromere-bounded SCNA breakpoints between peaks ####
message("Step2: Extracting breakpoint peak file...")
# Extract the 4 rows necessary for later calculation
biscut_peak_loc_abslocs_unique <- unique(biscut_peak_loc_abslocs[, c("Chr", "direction", "telcent", "ArmType")])
# Calculate the max number of peaks within one chr arm
dup_count <- biscut_peak_loc_abslocs %>%
group_by(Chr, direction, telcent, ArmType) %>%
summarise(count = n(), .groups = "drop") %>%
as.data.frame(.)
# Create breakpoint_filenames to store paths of telomere/centromere-bounded SCNA breakpoint files
breakpoint_filenames <- unique(apply(biscut_peak_loc_abslocs_unique, 1, function(row) {
paste(breakpoint_file_path,"/",tumor_type,"_",row["Chr"], row["ArmType"], "_", row["direction"], "_",row["telcent"], ".txt",sep = "")
}))
# Change the name for chr 13,14,15,21,22 to match the names of breakpoint filenames
breakpoint_filenames <- gsub("_13p","_13",breakpoint_filenames)
breakpoint_filenames <- gsub("_13q","_13",breakpoint_filenames)
breakpoint_filenames <- gsub("_14p","_14",breakpoint_filenames)
breakpoint_filenames <- gsub("_14q","_14",breakpoint_filenames)
breakpoint_filenames <- gsub("_15p","_15",breakpoint_filenames)
breakpoint_filenames <- gsub("_15q","_15",breakpoint_filenames)
breakpoint_filenames <- gsub("_21p","_21",breakpoint_filenames)
breakpoint_filenames <- gsub("_21q","_21",breakpoint_filenames)
breakpoint_filenames <- gsub("_22p","_22",breakpoint_filenames)
breakpoint_filenames <- gsub("_22q","_22",breakpoint_filenames)
breakpoint_filenames <- gsub(" ","",breakpoint_filenames)
# Calculate the max number of segments(between BISCUT peaks) in one arm
n = max(dup_count$count)+1
# Create column names for Expected and True breakpoint count within one segment
col_names <- paste0("E_", 0:(n-1), "_", 1:n)
new_rows_Expected <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_Expected) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_Expected)
col_names <- paste0("T_", 0:(n-1), "_", 1:n)
new_rows_True <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_True) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_True)
# Set the max number of peak in one chr arm
biscut_peak_n <- max(dup_count$count)
# Set names for peak start
col_names <- paste0("biscut_peak_start_", 1:biscut_peak_n)
new_rows_biscut_ps <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_biscut_ps) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_biscut_ps)
# Set names for peak end
col_names <- paste0("biscut_peak_end_", 1:biscut_peak_n)
new_rows_biscut_pe <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_biscut_pe) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_biscut_pe)
# Set names for peak length as percentage
col_names <- paste0("biscut_peak_length_percent_", 1:biscut_peak_n)
new_rows_biscut_plp <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_biscut_plp) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_biscut_plp)
# Read breakpoint_filenames
message("Matching breakpoint files with BISCUT peak files...")
count_n = 1
for (filenames in breakpoint_filenames){
tmp_file <- read.csv(filenames,sep='\t',header=T)
tmp_file <- subset(tmp_file,percent>0) # Extract samples with SCNA length in the chr arm
tmp_count_df <- as.data.frame(table(tmp_file$end)) # Extract SCNA breakpoint end as percentage and count its number in the chr arm
tmp_count_df$Var1 <- as.numeric(as.character(tmp_count_df$Var1)) # Var1 correspond to the breakpoint end location in the chr arm, shown as percentage
tmp_count_df$Freq <- as.numeric(as.character(tmp_count_df$Freq)) # Freq correspond to the count of the same breakpoints in the chr arm, shown as integer
message("Processing ",biscut_peak_loc_abslocs_unique[count_n,]$Chr,biscut_peak_loc_abslocs_unique[count_n,]$ArmType," ", biscut_peak_loc_abslocs_unique[count_n,]$telcent, " ",biscut_peak_loc_abslocs_unique[count_n,]$direction)
# Extract BISCUT indentified peak(peaks that located at the chr arm in breakpoint_filenames) locations(as percentage)
tmp_biscut_peak_start <- subset(biscut_peak_loc_abslocs, Chr==biscut_peak_loc_abslocs_unique[count_n,]$Chr & direction==biscut_peak_loc_abslocs_unique[count_n,]$direction & telcent==biscut_peak_loc_abslocs_unique[count_n,]$telcent & ArmType==biscut_peak_loc_abslocs_unique[count_n,]$ArmType)$Start_percent
tmp_biscut_peak_end <- subset(biscut_peak_loc_abslocs, Chr==biscut_peak_loc_abslocs_unique[count_n,]$Chr & direction==biscut_peak_loc_abslocs_unique[count_n,]$direction & telcent==biscut_peak_loc_abslocs_unique[count_n,]$telcent & ArmType==biscut_peak_loc_abslocs_unique[count_n,]$ArmType)$End_percent
tmp_biscut_peak_length_percent <- subset(biscut_peak_loc_abslocs, Chr==biscut_peak_loc_abslocs_unique[count_n,]$Chr & direction==biscut_peak_loc_abslocs_unique[count_n,]$direction & telcent==biscut_peak_loc_abslocs_unique[count_n,]$telcent & ArmType==biscut_peak_loc_abslocs_unique[count_n,]$ArmType)$Peak_Length_percent
message("Filtering telomere/centromere-bounded SCNA breakpoints before BISCUT peak1 start: ",min(tmp_biscut_peak_start))
tmp_count_df_subset1 <- subset(tmp_count_df, Var1 < min(tmp_biscut_peak_start)) # SCNA breakpoint end < BISCUT peak start are identified as E_0_1
tmp_count_df_subset2 <- subset(tmp_count_df, Var1 < 1 & Var1 > max(tmp_biscut_peak_end)) # Extract the SCNA breakpoint end in the last segment(located before arm end(1) and after the last BISCUT end(max(tmp_biscut_peak_end)))
tmp_E_0_1 <- sum(tmp_count_df_subset1$Freq) # Calculate the total count of Expected breakpoints in the first segment, which equals True total count of breakpoints
tmp_T_0_1 <- sum(tmp_count_df_subset1$Freq) # Calculate the total count of True breakpoints in the first segment
tmp_T_1_2 <- sum(tmp_count_df_subset2$Freq) # Calculate the total count of True breakpoints in the second segment, will change later if the number of BISCUT peak is larger than 1
biscut_peak_loc_abslocs_unique[count_n,]$E_0_1 <- tmp_E_0_1
biscut_peak_loc_abslocs_unique[count_n,]$T_0_1 <- tmp_T_0_1
if (length(tmp_biscut_peak_start)==1){
message("There's only 1 peak on ", biscut_peak_loc_abslocs_unique[count_n,]$Chr,biscut_peak_loc_abslocs_unique[count_n,]$ArmType," ", biscut_peak_loc_abslocs_unique[count_n,]$telcent, " ",biscut_peak_loc_abslocs_unique[count_n,]$direction)
message("Filtering telomere/centromere-bounded SCNA breakpoints before arm end")
biscut_peak_loc_abslocs_unique[count_n,]$biscut_peak_start_1 <- tmp_biscut_peak_start
biscut_peak_loc_abslocs_unique[count_n,]$biscut_peak_end_1 <- tmp_biscut_peak_end
biscut_peak_loc_abslocs_unique[count_n,]$biscut_peak_length_percent_1 <- tmp_biscut_peak_length_percent
tmp_count_df_subset2 <- subset(tmp_count_df, Var1 < 1 & Var1 > tmp_biscut_peak_end) # SCNA breakpoint end < BISCUT peak start are identified as E_0_1
biscut_peak_loc_abslocs_unique[count_n,]$T_1_2 <- tmp_T_1_2
}else if(length(tmp_biscut_peak_start)>1){
message("There're ",length(tmp_biscut_peak_start)," peaks on ", biscut_peak_loc_abslocs_unique[count_n,]$Chr,biscut_peak_loc_abslocs_unique[count_n,]$ArmType," ", biscut_peak_loc_abslocs_unique[count_n,]$telcent, " ",biscut_peak_loc_abslocs_unique[count_n,]$direction)
for(peak_n in 1:length(tmp_biscut_peak_start)){
message("Processing peak ", peak_n, "...")
bps_col_name <- paste0("biscut_peak_start_", peak_n)
bpe_col_name <- paste0("biscut_peak_end_", peak_n)
bplp_col_name <- paste0("biscut_peak_length_percent_", peak_n)
true_col_name <- paste0("T_", peak_n-1, "_", peak_n)
biscut_peak_loc_abslocs_unique[count_n,bps_col_name] <- tmp_biscut_peak_start[peak_n]
biscut_peak_loc_abslocs_unique[count_n,bpe_col_name] <- tmp_biscut_peak_end[peak_n]
biscut_peak_loc_abslocs_unique[count_n,bplp_col_name] <- tmp_biscut_peak_length_percent[peak_n]
if (1 < peak_n & peak_n < length(tmp_biscut_peak_start)){
tmp_count_df_subset <- subset(tmp_count_df, Var1 < tmp_biscut_peak_start[peak_n] & Var1 > tmp_biscut_peak_end[peak_n-1])
biscut_peak_loc_abslocs_unique[count_n,true_col_name] <- sum(tmp_count_df_subset$Freq)
}else if(peak_n==length(tmp_biscut_peak_start)){
tmp_count_df_subset1 <- subset(tmp_count_df, Var1 < tmp_biscut_peak_start[peak_n] & Var1 > tmp_biscut_peak_end[peak_n-1])
tmp_count_df_subset2 <- subset(tmp_count_df, Var1 < 1 & Var1 > tmp_biscut_peak_end[peak_n])
true_col_name2 <- paste0("T_", peak_n, "_", peak_n+1)
biscut_peak_loc_abslocs_unique[count_n,true_col_name] <- sum(tmp_count_df_subset1$Freq)
biscut_peak_loc_abslocs_unique[count_n,true_col_name2] <- sum(tmp_count_df_subset2$Freq)
}
}
}
count_n = count_n+1
}
message("Matching BISCUT peak files with BISCUT gene files...")
dup_count$MultiPeak <- "TRUE"
dup_count$MultiPeak[which(dup_count$count == 1)] <- "FALSE"
dup_peak <- unique(biscut_peak_loc_abslocs[, c("Chr", "direction", "telcent", "ArmType","Start","End","Start_percent","End_percent")])
dup_peak <- dup_peak[order(dup_peak$Start),]
dup_peak <- dup_peak %>%
group_by(Chr, direction, telcent, ArmType) %>%
mutate(
PeakNum = paste0("Peak", row_number()), # Assign peak numbers
PeakNum = ifelse(row_number() == n(), # Check if it's the last row in the group
paste0(PeakNum, "(Last)"),
PeakNum),
NextPeakStart = ifelse(grepl("\\(Last\\)", PeakNum), 1, lead(Start_percent)), # Use lead(Start_percent) for next peak, 1 if Last
PriorPeakEnd = ifelse(row_number() == 1, 0, lag(End_percent))
) %>%
ungroup()
colnames(dup_peak)[which(colnames(dup_peak)=="Start")] <- "Peak.Start"
colnames(dup_peak)[which(colnames(dup_peak)=="End")] <- "Peak.End"
colnames(dup_peak)[which(colnames(dup_peak)=="Start_percent")] <- "Peak.Start_percent"
colnames(dup_peak)[which(colnames(dup_peak)=="End_percent")] <- "Peak.End_percent"
biscut_gene_loc_abslocs_peak <- biscut_gene_loc_abslocs %>%
left_join(dup_peak, by = c("Chr" = "Chr","direction" = "direction", "telcent" = "telcent", "ArmType" = "ArmType","Peak.Start"="Peak.Start","Peak.End"="Peak.End"))
message("Matching breakpoint files with BISCUT gene files")
biscut_gene_loc_abslocs_peak$E_0_1 <- NA
biscut_gene_loc_abslocs_peak$E_1_2 <- NA
biscut_gene_loc_abslocs_peak$T_0_1 <- NA
biscut_gene_loc_abslocs_peak$T_1_2 <- NA
count_n=1
for (filenames in breakpoint_filenames){
tmp_file <- read.csv(filenames,sep='\t',header=T)
tmp_file <- subset(tmp_file,percent>0) # Extract samples with SCNA length in the chr arm
tmp_count_df <- as.data.frame(table(tmp_file$end)) # Extract SCNA breakpoint end as percentage and count its number in the chr arm
tmp_count_df$Var1 <- as.numeric(as.character(tmp_count_df$Var1)) # Var1 correspond to the breakpoint end location in the chr arm, shown as percentage
tmp_count_df$Freq <- as.numeric(as.character(tmp_count_df$Freq)) # Freq correspond to the count of the same breakpoints in the chr arm, shown as integer
tmp_Chr <- biscut_peak_loc_abslocs_unique[count_n,]$Chr
tmp_ArmType <- biscut_peak_loc_abslocs_unique[count_n,]$ArmType
tmp_telcent <- biscut_peak_loc_abslocs_unique[count_n,]$telcent
tmp_direction <- biscut_peak_loc_abslocs_unique[count_n,]$direction
message("Processing ",biscut_peak_loc_abslocs_unique[count_n,]$Chr,biscut_peak_loc_abslocs_unique[count_n,]$ArmType," ", biscut_peak_loc_abslocs_unique[count_n,]$telcent, " ",biscut_peak_loc_abslocs_unique[count_n,]$direction)
# Matching BISCUT indentified genes
for (gn in 1:nrow(biscut_gene_loc_abslocs_peak)){
if(biscut_gene_loc_abslocs_peak[gn,]$Chr==tmp_Chr & biscut_gene_loc_abslocs_peak[gn,]$ArmType==tmp_ArmType & biscut_gene_loc_abslocs_peak[gn,]$telcent==tmp_telcent & biscut_gene_loc_abslocs_peak[gn,]$direction==tmp_direction){
message(filenames)
tmp_biscut_gene_start <- biscut_gene_loc_abslocs_peak[gn,]$Start_percent
tmp_biscut_gene_end <- biscut_gene_loc_abslocs_peak[gn,]$End_percent
tmp_biscut_gene_length_percent <- biscut_gene_loc_abslocs_peak[gn,]$Gene_Length_percent
tmp_prior_peak_end_percent <- biscut_gene_loc_abslocs_peak[gn,]$PriorPeakEnd
tmp_next_peak_start_percent <- biscut_gene_loc_abslocs_peak[gn,]$NextPeakStart
tmp_gene_name <- biscut_gene_loc_abslocs_peak[gn,]$Gene
message("Filtering telomere/centromere-bounded SCNA breakpoints before ", tmp_gene_name," start: ",tmp_biscut_gene_start)
tmp_count_df_subset1 <- subset(tmp_count_df, Var1 < tmp_biscut_gene_start & Var1 > tmp_prior_peak_end_percent) # SCNA breakpoint end < BISCUT gene start are identified as E_0_1
tmp_count_df_subset2 <- subset(tmp_count_df, Var1 < tmp_next_peak_start_percent & Var1 > tmp_biscut_gene_end) # Extract the SCNA breakpoint end in the segment after the BISCUT gene end
tmp_E_0_1 <- sum(tmp_count_df_subset1$Freq) # Calculate the total count of Expected breakpoints in the first segment, which equals True total count of breakpoints
tmp_T_0_1 <- sum(tmp_count_df_subset1$Freq) # Calculate the total count of True breakpoints in the first segment
tmp_T_1_2 <- sum(tmp_count_df_subset2$Freq) # Calculate the total count of True breakpoints in the second segment
biscut_gene_loc_abslocs_peak[gn,]$E_0_1 <- tmp_E_0_1
biscut_gene_loc_abslocs_peak[gn,]$T_0_1 <- tmp_T_0_1
biscut_gene_loc_abslocs_peak[gn,]$T_1_2 <- tmp_T_1_2
message("Gene ",gn," ",tmp_gene_name,": ",tmp_E_0_1,"|",tmp_T_0_1,"|",tmp_T_1_2)
}
}
count_n = count_n+1
}
message("Step2 All Done!")
# Step3: Fit breakpoints to beta distribution to obtain alpha and beta value ####
biscut_peak_loc_abslocs_unique$alpha <- NA
biscut_peak_loc_abslocs_unique$beta <- NA
count_n = 1
for (filenames in breakpoint_filenames){
tmp_file <- read.csv(filenames,sep='\t',header=T)
betafit <- fitdist(tmp_file$percent,'beta')
alpha = summary(betafit)$estimate[1]
beta = summary(betafit)$estimate[2]
biscut_peak_loc_abslocs_unique[count_n,"alpha"] <- alpha
biscut_peak_loc_abslocs_unique[count_n,"beta"] <- beta
count_n = count_n+1
}
arm_alpha_beta <- biscut_peak_loc_abslocs_unique[,c("Chr","direction","telcent","ArmType","alpha","beta")]
biscut_gene_loc_abslocs_peak <- biscut_gene_loc_abslocs_peak %>%
left_join(arm_alpha_beta, by = c("Chr" = "Chr","direction" = "direction", "telcent" = "telcent", "ArmType" = "ArmType"))
message("Step3 All Done!")
# Step4: Use BISCUT peak to estimate expected count of telomere/centromere-bounded SCNA breakpoints between peaks ####
message("Calculating peak-level expected count of telomere/centromere-bounded SCNA breakpoints")
for(n in 1:nrow(biscut_peak_loc_abslocs_unique)){
for(peak_n in 1:max(dup_count$count)){
expected_n_1_col_name <- paste0("E_", peak_n-1, "_", peak_n)
expected_n_col_name <- paste0("E_", peak_n, "_", peak_n+1)
bps_n_col_name <- paste0("biscut_peak_start_", peak_n)
bpe_n_col_name <- paste0("biscut_peak_end_", peak_n)
bplp_n_col_name <- paste0("biscut_peak_length_percent_", peak_n)
bps_n1_col_name <- paste0("biscut_peak_start_", peak_n+1)
bpe_n1_col_name <- paste0("biscut_peak_end_", peak_n+1)
bplp_n1_col_name <- paste0("biscut_peak_length_percent_", peak_n+1)
bps_n_1_col_name <- paste0("biscut_peak_start_", peak_n-1)
bpe_n_1_col_name <- paste0("biscut_peak_end_", peak_n-1)
bplp_n_1_col_name <- paste0("biscut_peak_length_percent_", peak_n-1)
if (peak_n == 1){
if(bps_n1_col_name %in% colnames(biscut_peak_loc_abslocs_unique)){
if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name])==TRUE){
biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(1,biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
}else if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name])==FALSE){
biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
}
}else if(((bps_n1_col_name %in% colnames(biscut_peak_loc_abslocs_unique)==FALSE))){
biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(1,biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
}
}else if(1<peak_n & peak_n<max(dup_count$count)){
if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n_col_name])==FALSE & is.na(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name])==FALSE){
biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
}else if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n_col_name])==FALSE & is.na(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name])==TRUE){
biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(1,biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
}
}else if(peak_n==max(dup_count$count)){
if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n_col_name])==FALSE){
biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(1,biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
}
}
}
}
message("Calculating gene-level expected count of telomere/centromere-bounded SCNA breakpoints")
for (n in 1:nrow(biscut_gene_loc_abslocs_peak)){
biscut_gene_loc_abslocs_peak[n,"E_1_2"] <- (pbeta(biscut_gene_loc_abslocs_peak[n,"NextPeakStart"],biscut_gene_loc_abslocs_peak[n,"alpha"],biscut_gene_loc_abslocs_peak[n,"beta"])-pbeta(biscut_gene_loc_abslocs_peak[n,"End_percent"],biscut_gene_loc_abslocs_peak[n,"alpha"],biscut_gene_loc_abslocs_peak[n,"beta"]))*biscut_gene_loc_abslocs_peak[n,"E_0_1"]/(pbeta(biscut_gene_loc_abslocs_peak[n,"Start_percent"],biscut_gene_loc_abslocs_peak[n,"alpha"],biscut_gene_loc_abslocs_peak[n,"beta"])-pbeta(biscut_gene_loc_abslocs_peak[n,"PriorPeakEnd"],biscut_gene_loc_abslocs_peak[n,"alpha"],biscut_gene_loc_abslocs_peak[n,"beta"]))
}
message("Step4 All Done!")
# Step5: Divide Expected count of telomere/centromere-bounded SCNA breakpoints with True telomere/centromere-bounded SCNA breakpoints to infer fitness ####
col_names <- paste0("biscut_peak", 1:biscut_peak_n,"_RF")
new_rows_biscut_prf <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_biscut_prf) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_biscut_prf)
# Calculate Peak-level Relative Fitness as (True breakpoint count)/(Expected breakpoint count)
for(n in 1:nrow(biscut_peak_loc_abslocs_unique)){
for(peak_n in 1:max(dup_count$count)){
expected_n_col_name <- paste0("E_", peak_n, "_", peak_n+1)
true_n_col_name <- paste0("T_", peak_n, "_", peak_n+1)
RF_n_col_name <- paste0("biscut_peak", peak_n, "_RF")
biscut_peak_loc_abslocs_unique[n,RF_n_col_name] <- (biscut_peak_loc_abslocs_unique[n,true_n_col_name]+0.000000001)/(biscut_peak_loc_abslocs_unique[n,expected_n_col_name]+0.000000001)
}
}
# Calculate Gene-level Relative Fitness as (True breakpoint count)/(Expected breakpoint count)
biscut_gene_loc_abslocs_peak$RF <- (biscut_gene_loc_abslocs_peak$T_1_2+0.000000001)/(biscut_gene_loc_abslocs_peak$E_1_2+0.000000001)
message("Step5 All Done!")
message("Writting Files to ",output_path,"/",tumor_type)
write.table(biscut_peak_loc_abslocs_unique,paste0(output_path,"/",tumor_type,"_biscut_PANCAN_peak_RF.txt"),col.names = T, row.names = F, sep = '\t', quote = F)
write.table(biscut_gene_loc_abslocs_peak,paste0(output_path,"/",tumor_type,"_biscut_PANCAN_gene_RF.txt"),col.names = T, row.names = F, sep = '\t', quote = F)