Last updated: 2018-03-22
Code version: c7928e2
This analysis is to recreate some of the figures from the Mayer paper using my data.
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(ggplot2)
library(workflowr)
Loading required package: rmarkdown
This is workflowr version 0.7.0
Run ?workflowr for help getting started
library(gplots)
Attaching package: 'gplots'
The following object is masked from 'package:stats':
lowess
library(RColorBrewer)
library(scales)
library(reshape2)
Warning: package 'reshape2' was built under R version 3.4.3
Correlation between 2 biological replicates for netseq gene counts. ‘The data set with higher coverage was randomly downsampled to match the total number of reads of the other data set.’
First I will do this pre-filtering.
gene_cov_18486= read.table("../data/NET3-18486.gene.coverage.bed")
colnames(gene_cov_18486)= c("chr", "start", "end", "gene", "score", "strand", "count")
gene_cov_18505= read.table("../data/NET3-18505.gene.coverage.bed")
colnames(gene_cov_18505)= c("chr", "start", "end", "gene", "score", "strand", "count")
Sum for each of the counts:
sum(gene_cov_18486$count)
[1] 166338954
sum(gene_cov_18505$count)
[1] 128882124
plot(gene_cov_18486$count~gene_cov_18505$count, ylim=c(0,50000), xlim=c(0,50000))
Change to RPKM: divide by million of reads in the library and length :
18486: mapped reads: 65189389 18505: mapped reads: 52507749
mapped_18486_mil= 65189389 / 10^6
mapped_18505_mil= 52507749 / 10^6
gene_cov_18486 =gene_cov_18486 %>% mutate(K_length= (end - start)/100) %>% mutate(rpkm=.25 + count/(K_length * mapped_18486_mil))
gene_cov_18505 =gene_cov_18505 %>% mutate(K_length= (end - start)/100) %>% mutate(rpkm= .25 +count/(K_length * mapped_18505_mil))
Plot again:
plot(log10(gene_cov_18486$rpkm) ~log10(gene_cov_18505$rpkm), ylab="log10 RPKM reads 18486", xlab="log10 RPKM reads 18505", main="Fig 1D recreation")
abline(lm(log10(gene_cov_18486$rpkm) ~ log10(gene_cov_18505$rpkm)), col="blue")
correlation:
cor(log10(gene_cov_18486$rpkm),log10(gene_cov_18505$rpkm))
[1] 0.9807117
Create the exon file from gencode.v19.annotation.bed. Filter the exons.
awk '/exon/' gencode.v19.annotation.gtf > gencode.v19.exon.annotation.gtf
Use featureCounts to quantify counts in these exons for 18486.
#!/bin/bash
#SBATCH --job-name=exon_cov
#SBATCH --time=8:00:00
#SBATCH --output=exon_cov.out
#SBATCH --error=exon_cov.err
#SBATCH --partition=broadwl
#SBATCH --mem=20G
#SBATCH --mail-type=END
module load Anaconda3
source activate net-seq
#input is a bed file
sample=$1
describer=$(echo ${sample} | sed -e 's/.*\YG-SP-//' | sed -e "s/_combined_Netpilot-sort.bam$//")
featureCounts -T 5 -a /project2/gilad/briana/genome_anotation_data/gencode.v19.exon.annotation.gtf -t exon -g exon_id -o /project2/gilad/briana/Net-seq-pilot/data/exon_cov/${describer}_combined_Netpilot-sort.exon.cov.txt $1
Run on /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-18486_combined_Netpilot-sort.bam
Filter the exons I want to use for the analsis:
exon_cov_18486= read.table("../data/NET3-18486_combined_Netpilot-sort.exon.cov.txt", header=TRUE)
colnames(exon_cov_18486)= c("ExonID", "Chr", "Start", "End", "Strand", "Length","Count" )
Convert to RPKM
mapped_reads= 40670322 + 9798066 + 14721001
exon_cov_18486= exon_cov_18486 %>% mutate(RPKM=(Count/(Length/1000))/(mapped_reads/10^6))
exon_cov_18486_no0= exon_cov_18486 %>% filter(RPKM!=0)
Plot the exon counts in log10(RPKM):
plot(log10(sort(exon_cov_18486_no0$RPKM, decreasing = TRUE)), col=ifelse(log10(sort(exon_cov_18486_no0$RPKM, decreasing = TRUE))>.28, "red", "black"), ylab = "log10 RPKM", xlab="Exon", main= "RPKM expression of exons 18486")
log10(sort(exon_cov_18486_no0$RPKM, decreasing = TRUE))[1990]
[1] 0.2804453
#39803*.05
Use top 5% - the top 10.
exon_cov_18486_no0_sort= exon_cov_18486_no0[order(exon_cov_18486_no0$RPKM, decreasing = TRUE),]
exon_cov_18486_no0_sort = exon_cov_18486_no0_sort %>% filter(log10(RPKM) > .28)
exon_list_table= exon_cov_18486_no0_sort[6:1911,1:6 ]
#write.table(exon_list_table,"../data/top5_exonlist.txt", col.names = TRUE, row.names = FALSE, quote = FALSE, sep="\t" )
File is: /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486.bed
I want to make two seperate files, one for the 3’ SS and one for the 5’ splice site.
less top5_exonlist2_18486.txt | awk ' {if($5 == "+") print($1 "\t" $2 "\t" $3 -40 "\t" $3 +40 "\t" $5 ); else print($1 "\t" $2 "\t" $4 - 40 "\t" $4 +40 "\t" $5)}' > top5_exonlist_18486_fiveprime.txt
less top5_exonlist2_18486.txt | awk ' {if($5 == "+") print($1 "\t" $2 "\t" $4 -40 "\t" $4 +40 "\t" $5 ); else print($1 "\t" $2 "\t" $3 - 40 "\t" $3 + 40 "\t" $5)}' > top5_exonlist_18486_threeprime.txt
Problem: exon list has some exons twice in the same line
cp top5_exonlist_18486.txt test.txt
awk '{print($2)}' test.txt| cut -d ";" -f1 > test.col2.txt
awk '{print($3)}' test.txt| cut -d ";" -f1 > test.col3.txt
awk '{print($4)}' test.txt| cut -d ";" -f1 > test.col4.txt
awk '{print($5)}' test.txt| cut -d ";" -f1 > test.col5.txt
awk '{print($1)}' test.txt > test.col1.txt
awk '{print($6)}' test.txt > test.col6.txt
paste -d "\t" test.col1.txt test.col2.txt test.col3.txt test.col4.txt test.col5.txt test.col6.txt > top5_exonlist2_18486.txt
Rerun aboove to seperate the 3’ and 5’
Remove the chr so I can match it to the genome coverage file:
cat top5_exonlist_18486_fiveprime.txt | sed 's/chr//' > top5_exonlist_18486_fiveprime_noCHR.txt
cat top5_exonlist_18486_threeprime.txt | sed 's/chr//' > top5_exonlist_18486_threeprime_noCHR.txt
Make a bed file with just the first base of each of my reads from /project2/gilad/briana/Net-seq-pilot/data/bed/YG-SP-NET3-18486_combined_Netpilot-sort.bed . For positive reads I use the start and the start +1 and for negative reads I use the end -1 and the end. I then will use bedtools coverage -d -s (force strandedness).
awk '{print ($2 "\t" $3 "\t" $4 "\t" $1 "\t.\t" $5)}' top5_exonlist_18486_threeprime_noCHR.txt| sort -k1,1 -k2,2n > top5_exonlist_18486_threeprime_noCHR.bed
awk '{print ($2 "\t" $3 "\t" $4 "\t" $1 "\t.\t" $5)}' top5_exonlist_18486_fiveprime_noCHR.txt| sort -k1,1 -k2,2n > top5_exonlist_18486_fiveprime_noCHR.bed
sort YG-SP-NET3-18486_combined_Netpilot-sort.bed the same way to make this easier.
sort -k1,1 -k2,2n YG-SP-NET3-18486_combined_Netpilot-sort.bed > YG-SP-NET3-18486_combined_Netpilot-sort2.bed
awk '{if($6 == "+") print($1 "\t" $2 "\t" $2 + 1 "\t" $4 "\t" $5 "\t" $6 ); else print($1 "\t" $3 -1 "\t" $3 "\t" $4 "\t" $5 "\t" $6 )}' YG-SP-NET3-18486_combined_Netpilot-sort2.bed > YG-SP-NET3-18486_combined_Netpilot-sort2-bp.bed
#!/bin/bash
#SBATCH --job-name=ss_cov
#SBATCH --time=8:00:00
#SBATCH --output=ss_cov.out
#SBATCH --error=ss_cov.err
#SBATCH --partition=broadwl
#SBATCH --mem=40G
#SBATCH --mail-type=END
module load Anaconda3
source activate net-seq
bedtools coverage -d -s -a /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_threeprime_noCHR.bed -b /project2/gilad/briana/Net-seq-pilot/data/bed/YG-SP-NET3-18486_combined_Netpilot-sort2-bp2.bed > /project2/gilad/briana/Net-seq-pilot/data/ss_cov/top5_exonlist_18486_threeprime_cov.txt
bedtools coverage -d -s -a /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_fiveprime_noCHR.bed -b /project2/gilad/briana/Net-seq-pilot/data/bed/YG-SP-NET3-18486_combined_Netpilot-sort2-bp2.bed > /project2/gilad/briana/Net-seq-pilot/data/ss_cov/top5_exonlist_18486_fiveprime_cov.txt
five_prime_cov=read.table("../data/top5_exonlist_18486_fiveprime_cov.txt")
colnames(five_prime_cov)= c("chr", "start", "end", "exon", "score","strand", "pos", "count")
three_prime_cov=read.table("../data/top5_exonlist_18486_threeprime_cov.txt")
colnames(three_prime_cov)= c("chr", "start", "end", "exon","score","strand", "pos", "count")
summarize on window position to make top plot
cov_bypos_pos= five_prime_cov %>% filter(strand=="+") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg= five_prime_cov %>% filter(strand=="-") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg_vec= rev(as.numeric(unlist(cov_bypos_neg[,2])))
cov_bypos=cbind(cov_bypos_pos, cov_bypos_neg_vec)
cov_bypos_all= cov_bypos %>% mutate(sum_by_pos=sum_pos + cov_bypos_neg_vec )
cov_bypos_all= cov_bypos_all %>% select(pos, sum_by_pos)
Plot it:
cov_bypos_all_long=melt(cov_bypos_all, id.vars = 'pos', value.name ='count' ) %>% filter(pos > 30 & pos< 50)
ggplot(cov_bypos_all_long,aes(pos,count)) + geom_line() + ggtitle("5' splice site")
3 prime end:
cov_bypos_pos_3= three_prime_cov %>% filter(strand=="+") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg_3= three_prime_cov %>% filter(strand=="-") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg_vec_3= rev(as.numeric(unlist(cov_bypos_neg_3[,2])))
cov_bypos_3=cbind(cov_bypos_pos_3, cov_bypos_neg_vec_3)
cov_bypos_all_3= cov_bypos_3 %>% mutate(sum_by_pos=sum_pos + cov_bypos_neg_vec_3 )
cov_bypos_all_3= cov_bypos_all_3 %>% select(pos, sum_by_pos)
cov_bypos_all_long_3=melt(cov_bypos_all_3, id.vars = 'pos', value.name ='count' ) %>% filter(pos > 20 & pos< 60)
ggplot(cov_bypos_all_long_3,aes(pos,count)) + geom_line() + ggtitle("3' splice site")
-get rid of the highly expressed bins if one of these exons is in it.
The script splicesite_cov2.sh is not strand specific on mapping coverage.
#!/bin/bash
#SBATCH --job-name=ss_cov2
#SBATCH --time=8:00:00
#SBATCH --output=ss_cov2.out
#SBATCH --error=ss_cov2.err
#SBATCH --partition=broadwl
#SBATCH --mem=40G
#SBATCH --mail-type=END
module load Anaconda3
source activate net-seq
bedtools coverage -d -a /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_threeprime_noCHR.bed -b /project2/gilad/briana/Net-seq-pilot/data/bed/YG-SP-NET3-18486_combined_Netpilot-sort2-bp2.bed > /project2/gilad/briana/Net-seq-pilot/data/ss_cov/top5_exonlist_18486_threeprime_cov2.txt
bedtools coverage -d -a /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_fiveprime_noCHR.bed -b /project2/gilad/briana/Net-seq-pilot/data/bed/YG-SP-NET3-18486_combined_Netpilot-sort2-bp2.bed > /project2/gilad/briana/Net-seq-pilot/data/ss_cov/top5_exonlist_18486_fiveprime_cov2.txt
five_prime_cov2=read.table("../data/top5_exonlist_18486_fiveprime_cov2.txt", stringsAsFactors = FALSE)
colnames(five_prime_cov2)= c("chr", "start", "end", "exon", "score","strand", "pos", "count")
three_prime_cov2=read.table("../data/top5_exonlist_18486_threeprime_cov2.txt", stringsAsFactors = FALSE)
colnames(three_prime_cov2)= c("chr", "start", "end", "exon","score","strand", "pos", "count")
cov_bypos_pos2= five_prime_cov2 %>% filter(strand=="+") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg2= five_prime_cov2 %>% filter(strand=="-") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg_vec2= rev(as.numeric(unlist(cov_bypos_neg2[,2])))
cov_bypos2=cbind(cov_bypos_pos2, cov_bypos_neg_vec2)
cov_bypos_all2= cov_bypos2 %>% mutate(sum_by_pos=sum_pos + cov_bypos_neg_vec )
cov_bypos_all2= cov_bypos_all2 %>% select(pos, sum_by_pos)
Plot it:
cov_bypos_all_long2=melt(cov_bypos_all2, id.vars = 'pos', value.name ='count' )
ggplot(cov_bypos_all_long2,aes(pos,count)) + geom_line() + ggtitle("5' splice site, not SS")
3 prime end:
cov_bypos_pos2_3= three_prime_cov2 %>% filter(strand=="+") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg2_3= three_prime_cov2 %>% filter(strand=="-") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg_vec2_3= rev(as.numeric(unlist(cov_bypos_neg2_3[,2])))
cov_bypos2_3=cbind(cov_bypos_pos2_3, cov_bypos_neg_vec2_3)
cov_bypos_all2_3= cov_bypos2_3 %>% mutate(sum_by_pos=sum_pos + cov_bypos_neg_vec2_3 )
cov_bypos_all2_3= cov_bypos_all2_3 %>% select(pos, sum_by_pos)
cov_bypos_all_long2_3=melt(cov_bypos_all2_3, id.vars = 'pos', value.name ='count' ) %>% filter(pos > 20 & pos< 60)
ggplot(cov_bypos_all_long2_3,aes(pos,count)) + geom_line() + ggtitle("3' splice site, not SS")
I will use the files I created in the create_blacklist analysis to filter the exon regions that fall in those locations. Use bedtools intersect:
/project2/gilad/briana/genome_anotation_data/top5_gen_wind200.tab.nochr.sort.bed
project2/gilad/briana/genome_anotation_data/snRNA.gencode.v19.nochr.bed
/project2/gilad/briana/genome_anotation_data/snoRNA.gencode.v19.nochr.bed
/project2/gilad/briana/genome_anotation_data/rRNA.gencode.v19.nochr.bed
splicesite_cov2_filter.sh
#!/bin/bash
#SBATCH --job-name=filt_exoncov
#SBATCH --time=8:00:00
#SBATCH --output=filt_exoncov_sbatch.out
#SBATCH --error=filt_exoncov_sbatch.err
#SBATCH --partition=broadwl
#SBATCH --mem=40G
#SBATCH --mail-type=END
module load Anaconda3
source activate net-seq
bedtools intersect -v -wa -a /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_fiveprime_noCHR.bed -b /project2/gilad/briana/genome_anotation_data/top5_gen_wind200.tab.nochr.sort.bed /project2/gilad/briana/genome_anotation_data/snRNA.gencode.v19.nochr.bed /project2/gilad/briana/genome_anotation_data/snoRNA.gencode.v19.nochr.bed /project2/gilad/briana/genome_anotation_data/rRNA.gencode.v19.nochr.bed > /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_fiveprime_noCHR_filter.bed
bedtools coverage -d -a /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_fiveprime_noCHR_filter.bed -b /project2/gilad/briana/Net-seq-pilot/data/bed/YG-SP-NET3-18486_combined_Netpilot-sort2-bp2.bed > /project2/gilad/briana/Net-seq-pilot/data/ss_cov/top5_exonlist_18486_fiveprime_cov2_filter.txt
bedtools intersect -v -wa -a /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_threeprime_noCHR.bed -b /project2/gilad/briana/genome_anotation_data/top5_gen_wind200.tab.nochr.sort.bed /project2/gilad/briana/genome_anotation_data/snRNA.gencode.v19.nochr.bed /project2/gilad/briana/genome_anotation_data/snoRNA.gencode.v19.nochr.bed /project2/gilad/briana/genome_anotation_data/rRNA.gencode.v19.nochr.bed > /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_threeprime_noCHR_filter.bed
bedtools coverage -d -a /project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_threeprime_noCHR_filter.bed -b /project2/gilad/briana/Net-seq-pilot/data/bed/YG-SP-NET3-18486_combined_Netpilot-sort2-bp2.bed > /project2/gilad/briana/Net-seq-pilot/data/ss_cov/top5_exonlist_18486_threeprime_cov2_filter.txt
This results in 464 3-prime and 445 5-prime exons.
five_prime_cov_F=read.table("../data/top5_exonlist_18486_fiveprime_cov2_filter.txt")
colnames(five_prime_cov_F)= c("chr", "start", "end", "exon", "score","strand", "pos", "count")
three_prime_cov_F=read.table("../data/top5_exonlist_18486_threeprime_cov2_filter.txt")
colnames(three_prime_cov_F)= c("chr", "start", "end", "exon","score","strand", "pos", "count")
cov_bypos_pos_F= five_prime_cov_F %>% filter(strand=="+") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg2_F= five_prime_cov_F %>% filter(strand=="-") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg_vec_F= rev(as.numeric(unlist(cov_bypos_neg2_F[,2])))
cov_bypos_F=cbind(cov_bypos_pos_F, cov_bypos_neg_vec_F)
cov_bypos_all_F= cov_bypos_F %>% mutate(sum_by_pos=sum_pos + cov_bypos_neg_vec )
cov_bypos_all_F= cov_bypos_all_F %>% select(pos, sum_by_pos)
Plot it:
cov_bypos_all_long_F=melt(cov_bypos_all_F, id.vars = 'pos', value.name ='count' )
ggplot(cov_bypos_all_long_F,aes(pos,count)) + geom_line() + ggtitle("5' splice site filtered exons")
3 prime end:
cov_bypos_posF_3= three_prime_cov_F %>% filter(strand=="+") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_negF_3= three_prime_cov_F %>% filter(strand=="-") %>% group_by(pos) %>% summarise(sum_pos=sum(count))
cov_bypos_neg_vecF_3= rev(as.numeric(unlist(cov_bypos_negF_3[,2])))
cov_byposF_3=cbind(cov_bypos_posF_3, cov_bypos_neg_vecF_3)
cov_bypos_allF_3= cov_byposF_3 %>% mutate(sum_by_pos=sum_pos + cov_bypos_neg_vec2_3 )
cov_bypos_allF_3= cov_bypos_allF_3 %>% select(pos, sum_by_pos)
cov_bypos_all_longF_3=melt(cov_bypos_allF_3, id.vars = 'pos', value.name ='count' )
ggplot(cov_bypos_all_longF_3,aes(pos,count)) + geom_line() + ggtitle("3' splice site filtered exons")
I want the data to be in a matrix exon by position. The row name is the every 80th entry of of the coverage file (1, 81, 161) and there should be 1906 rows. The rows are the 80 counts in the count column of the coverage file. I need to make a for loop through the rows that goes through seq(1,152480,80). I run this on cov_bypos_pos2_small first.
#five_prime_cov2_small=five_prime_cov2[1:160,]
#update data upload to have string at factor =false
make_matrix=function(cov_df){
matrix2=c(1:80)
names2=vector()
for (i in seq(1,nrow(cov_df),80)){
names2= append(names2,cov_df[i,4])
row_i=vector()
for (ea in seq(0,79,1)){
row_i= append(row_i, cov_df[i + ea,8])
}
matrix2=rbind(matrix2, row_i)
}
return(as.matrix(matrix2[2:nrow(matrix2),]))
}
Try this on the five prime coverage2 file. I need to normalize the points in the matrix so some are not super high.
five_prime_matrix2=make_matrix(five_prime_cov2)
five_prime_matrix2_norm=apply(t(five_prime_matrix2), 1, rescale)
Make an image picture:
my_palette <- colorRampPalette(c("white", "black"))(n = 100)
image(five_prime_matrix2_norm,col=my_palette, ylab="exon", xlab="Position")
Try on 3prime:
three_prime_matrix2=make_matrix(three_prime_cov2)
three_prime_matrix2_norm=apply(t(three_prime_matrix2), 1, rescale)
image(three_prime_matrix2_norm,col=my_palette, ylab="exon", xlab="Position")
Try with a binary matrix:
make_binary=function(matrix){
new_matrix= matrix(NA, nrow = nrow(matrix), ncol = ncol(matrix))
for(row in 1:nrow(matrix)) {
for(col in 1:ncol(matrix)) {
if(matrix[row,col]!= 0){
new_matrix[row,col]= 1
}
else{
new_matrix[row,col] = 0
}
}
}
return(new_matrix)
}
five_prime_matrix2_bin=make_binary(five_prime_matrix2)
three_prime_matrix2_bin= make_binary(three_prime_matrix2)
Try image with these:
image(three_prime_matrix2_bin,col=my_palette, ylab="exon", xlab="Position", main="Three prime coverage")
image(five_prime_matrix2_bin,col=my_palette, ylab="exon", xlab="Position", main="Five prime coverage")
Try to find a natural cuttoff to change the highest reads:
vec_five_prime_matrix2=vector("numeric", length = 152480)
for(row in 1:nrow(five_prime_matrix2)) {
for(col in 1:ncol(five_prime_matrix2)) {
vec_five_prime_matrix2[(row*col)]=five_prime_matrix2[row,col]
}
}
summary(vec_five_prime_matrix2)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 2.06 0.00 128251.00
plot(log10(vec_five_prime_matrix2))
abline(h=2, col="red")
quantile(vec_five_prime_matrix2,c(.995, .998, .999))
99.5% 99.8% 99.9%
6.000 36.000 125.042
Make any value above top .2% value (36)==36
cut_top=function(matrix){
new_matrix= matrix(NA, nrow = nrow(matrix), ncol = ncol(matrix))
for(row in 1:nrow(matrix)) {
for(col in 1:ncol(matrix)) {
if(matrix[row,col]>= 36){
new_matrix[row,col]= 36
}
else{
new_matrix[row,col] = matrix[row,col]
}
}
}
return(new_matrix)
}
three_prime_matrix2_998=cut_top(three_prime_matrix2)
five_prime_matrix2_998=cut_top(five_prime_matrix2)
Create the image:
image(three_prime_matrix2_998,col=my_palette, ylab="exon", xlab="Position", main="Three prime coverage")
image(five_prime_matrix2_998,col=my_palette, ylab="exon", xlab="Position", main="Five prime coverage")
I need to pull the data frames in pos and neg seperatly!! these images do not have flipped rows for neg strand
Run on just the positive strand first:
cov_bypos_pos2_matrix= five_prime_cov2 %>% filter(strand=="+") %>% make_matrix()
image(cov_bypos_pos2_matrix,col=my_palette, ylab="exon", xlab="Position", main="Positive 5 prime ")
cov_bypos_pos2_3_matrix = three_prime_cov2 %>% filter(strand=="+") %>% make_matrix()
image(cov_bypos_pos2_3_matrix,col=my_palette, ylab="exon", xlab="Position", main="Positive 3 prime ")
Do the same with the negative:
cov_bypos_neg2_matrix= five_prime_cov2 %>% filter(strand=="-") %>% make_matrix()
image(cov_bypos_neg2_matrix,col=my_palette, ylab="exon", xlab="Position", main="Negative 5 prime ")
cov_bypos_neg2_3_matrix= three_prime_cov2 %>% filter(strand=="-") %>% make_matrix()
image(cov_bypos_neg2_3_matrix,col=my_palette, ylab="exon", xlab="Position", main="Negative 3 prime ")
Make a function to reverse each row of the negative matricies:
rev_rows=function(matrix){
new_matrix= matrix(NA, nrow = nrow(matrix), ncol = ncol(matrix))
for (row in 1:nrow(matrix)){
x= rev(matrix[row,])
new_matrix[row,]=x
}
return(new_matrix)
}
Use this function to flip the negative matrix then bind it to the original. I will also cut the top to better the vizualization.
flip_neg2_5prime=rev_rows(cov_bypos_neg2_matrix)
full_5prime_matrix_cut= rbind(cov_bypos_pos2_matrix,flip_neg2_5prime) %>% cut_top()
flip_neg2_3prime=rev_rows(cov_bypos_neg2_3_matrix)
full_3prime_matrix_cut=rbind(cov_bypos_pos2_3_matrix, flip_neg2_3prime) %>% cut_top()
Images
image(full_5prime_matrix_cut,col=my_palette, ylab="exon", xlab="Position", main="5' all exons")
image(full_3prime_matrix_cut,col=my_palette, ylab="exon", xlab="Position", main="3' all exons")
Make these with the ggplot version:
heatmap.2(full_5prime_matrix_cut,dendrogram='none', Rowv=FALSE, Colv=FALSE,trace='none')
Try to make the top plots with mean and sd:
full_5prime_matrix= rbind(cov_bypos_pos2_matrix,flip_neg2_5prime)
means_5prime=apply(full_5prime_matrix, 2, mean)
sd_5prime=apply(full_5prime_matrix, 2, sd)
df_5prime=as.data.frame(cbind(pos=seq(-40,39,1), mean=means_5prime, SD=sd_5prime))
df_5prime_long= melt(df_5prime, id.vars="pos", measure.vars = c("mean", "SD"), varnames =c("mean", "SD"))
Plot the mean at each position:
ggplot(df_5prime,aes(x=pos, y=mean)) + geom_line() + ggtitle("5' splice site") + geom_ribbon(aes(x=pos, ymin=mean - SD,ymax=mean + SD), alpha=0.20) + xlim(c(-20,20))
Warning: Removed 39 rows containing missing values (geom_path).
Same for 3prime:
full_3prime_matrix=rbind(cov_bypos_pos2_3_matrix, flip_neg2_3prime)
means_3prime=apply(full_3prime_matrix, 2, mean)
sd_3prime=apply(full_3prime_matrix, 2, sd)
df_3prime=as.data.frame(cbind(pos=seq(-40,39,1), mean=means_3prime, SD=sd_3prime))
df_3prime_long= melt(df_3prime, id.vars="pos", measure.vars = c("mean", "SD"), varnames =c("mean", "SD"))
ggplot(df_3prime,aes(x=pos, y=mean)) + geom_line() + ggtitle("3' splice site mean coverage")+ geom_ribbon(aes(x=pos, ymin=mean - SD,ymax=mean + SD), alpha=0.20) +
xlim(c(-20,20))
Warning: Removed 39 rows containing missing values (geom_path).
Normalize per exon with my own function and recreate this with the means of the normalized values. I will write the normalization function for rows then use apply for the matrix.
normalize_row=function(row){
new_row=c()
min= as.numeric(min(row))
max= as.numeric(max(row))
for (val in row){
newval= (val-min)/(max-min)
new_row=append(new_row, newval)
}
return(new_row)
}
apply_row= function(matrix){
new_matrix= matrix(NA, nrow = nrow(matrix), ncol = ncol(matrix))
for (row in 1:nrow(matrix)){
x=normalize_row(matrix[row,])
new_matrix[row,]=x
}
return(new_matrix)
}
full_3prime_matrix_norm= apply_row(full_3prime_matrix)
full_3prime_matrix_norm= na.omit(full_3prime_matrix_norm)
means_3prime_norm=apply(full_3prime_matrix_norm, 2, mean)
sd_3prime_norm=apply(full_3prime_matrix_norm, 2, sd)
df_3prime_norm=as.data.frame(cbind(pos=seq(-40,39,1), mean=means_3prime_norm, SD=sd_3prime_norm))
norm_mean_plot3=ggplot(df_3prime_norm,aes(x=pos, y=mean)) + geom_line() + ggtitle("3' splice site mean coverage normalized")+ geom_ribbon(aes(x=pos, ymin=mean - SD,ymax=mean + SD), alpha=0.20)
full_5prime_matrix_norm= apply_row(full_5prime_matrix)
full_5prime_matrix_norm= na.omit(full_5prime_matrix_norm)
means_5prime_norm=apply(full_3prime_matrix_norm, 2, mean)
sd_5prime_norm=apply(full_5prime_matrix_norm, 2, sd)
df_5prime_norm=as.data.frame(cbind(pos=seq(-40,39,1), mean=means_5prime_norm, SD=sd_5prime_norm))
norm_mean_plot5=ggplot(df_5prime_norm,aes(x=pos, y=mean)) + geom_line() + ggtitle("5' splice site mean coverage normalized")+ geom_ribbon(aes(x=pos, ymin=mean - SD,ymax=mean + SD), alpha=0.20)
#heatmap_norm5prime=heatmap.2(full_5prime_matrix_norm,dendrogram='none', col=my_palette, Rowv=FALSE, Colv=FALSE,trace='none')
#heatmap_norm3prime=heatmap.2(full_3prime_matrix_norm,dendrogram='none', col=my_palette, Rowv=FALSE, Colv=FALSE,trace='none')
Plot the plots together:
par(mfrow=c(2,2))
heatmap.2(full_5prime_matrix_norm,dendrogram='none', col=my_palette, Rowv=FALSE, Colv=FALSE,trace='none')
heatmap.2(full_3prime_matrix_norm,dendrogram='none', col=my_palette, Rowv=FALSE, Colv=FALSE,trace='none')
norm_mean_plot5
norm_mean_plot3
I am going to go back to the strand specific data and use the final processing steps I used for this data:
cov_bypos_neg_matrix_ss= five_prime_cov %>% filter(strand=="-") %>% make_matrix()
cov_bypos_pos_matrix_ss= five_prime_cov %>% filter(strand=="+") %>% make_matrix()
cov_bypos_neg_3_matrix_ss= three_prime_cov %>% filter(strand=="-") %>% make_matrix()
cov_bypos_pos_3_matrix_ss= three_prime_cov %>% filter(strand=="+") %>% make_matrix()
flip_neg_5prime_ss=rev_rows(cov_bypos_neg_matrix_ss)
full_5prime_matrix_ss= rbind(cov_bypos_pos_matrix_ss,flip_neg_5prime_ss)
flip_neg_3prime_ss=rev_rows(cov_bypos_neg_3_matrix_ss)
full_3prime_matrix_ss=rbind(cov_bypos_pos_3_matrix_ss, flip_neg_3prime_ss)
full_3prime_matrix_norm_ss= apply_row(full_3prime_matrix_ss)
full_3prime_matrix_norm_ss= na.omit(full_3prime_matrix_norm_ss)
means_3prime_norm_ss=apply(full_3prime_matrix_norm_ss, 2, mean)
sd_3prime_norm_ss=apply(full_3prime_matrix_norm_ss, 2, sd)
df_3prime_norm_ss=as.data.frame(cbind(pos=seq(-40,39,1), mean=means_3prime_norm_ss, SD=sd_3prime_norm_ss))
norm_mean_plot3_ss=ggplot(df_3prime_norm_ss,aes(x=pos, y=mean)) + geom_line() + ggtitle("3' splice site mean coverage SS")+ geom_ribbon(aes(x=pos, ymin=mean - SD,ymax=mean + SD), alpha=0.20)
full_5prime_matrix_norm_ss= apply_row(full_5prime_matrix_ss)
full_5prime_matrix_norm_ss= na.omit(full_5prime_matrix_norm_ss)
means_5prime_norm_ss=apply(full_3prime_matrix_norm_ss, 2, mean)
sd_5prime_norm_ss=apply(full_5prime_matrix_norm_ss, 2, sd)
df_5prime_norm_ss=as.data.frame(cbind(pos=seq(-40,39,1), mean=means_5prime_norm_ss, SD=sd_5prime_norm_ss))
norm_mean_plot5_ss=ggplot(df_5prime_norm_ss,aes(x=pos, y=mean)) + geom_line() + ggtitle("5' splice site mean coverage SS")+ geom_ribbon(aes(x=pos, ymin=mean - SD,ymax=mean + SD), alpha=0.20)
par(mfrow=c(2,2))
heatmap.2(full_5prime_matrix_norm_ss,dendrogram='none', col=my_palette, Rowv=FALSE, Colv=FALSE,trace='none')
heatmap.2(full_3prime_matrix_norm_ss,dendrogram='none', col=my_palette, Rowv=FALSE, Colv=FALSE,trace='none')
norm_mean_plot5_ss
norm_mean_plot3_ss
You lose all signal when you do it this way. The 3prime matrix only has 118 exons remaining and the 5prime matrix only has 148.
Now make a python script that takes this and the genome coverage file in as a data frame. I will then make a list of the base counts corresponding to each exon region from the genome coverage file. Do it for the 5 prime file first.
import pandas as pd
exon_5prime= pd.read_table("/project2/gilad/briana/Net-seq-pilot/data/exon_cov/top5_exonlist_18486_fiveprime_noCHR.txt", header=None)
gen_cov= pd.read_table("/project2/gilad/briana/Net-seq-pilot/data/cov/YG-SP-NET3-18486_combined_Netpilot-sort.cov.bed", header=None)
#will make a list of lists that I will turn into a matrix later
exon_list=[]
for exon in exon_5prime.iterrows():
chr=exon[1]
start=exon[2]
end=exon[3]
Test on exon_5prime=pd.read_table(“/project2/gilad/briana/Net-seq-pilot/data/exon_cov/exon_5test.txt”, header=None)
awk 'BEGIN {ORS=","}; $1==1 && $2>=11968288 && $2<=11968368 {print $3}; END {print "\n"}' /project2/gilad/briana/Net-seq-pilot/data/cov/YG-SP-NET3-18486_combined_Netpilot-sort.cov.bed
sessionInfo()
R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2 reshape2_1.4.3 scales_0.5.0
[4] RColorBrewer_1.1-2 gplots_3.0.1 workflowr_0.7.0
[7] rmarkdown_1.8.5 ggplot2_2.2.1 dplyr_0.7.4
loaded via a namespace (and not attached):
[1] Rcpp_0.12.15 pillar_1.1.0 compiler_3.4.2
[4] git2r_0.21.0 plyr_1.8.4 bindr_0.1
[7] bitops_1.0-6 tools_3.4.2 digest_0.6.14
[10] jsonlite_1.5 evaluate_0.10.1 tibble_1.4.2
[13] gtable_0.2.0 pkgconfig_2.0.1 rlang_0.1.6
[16] yaml_2.1.16 stringr_1.2.0 knitr_1.18
[19] gtools_3.5.0 caTools_1.17.1 rprojroot_1.3-2
[22] grid_3.4.2 reticulate_1.4 glue_1.2.0
[25] R6_2.2.2 gdata_2.18.0 magrittr_1.5
[28] backports_1.1.2 htmltools_0.3.6 assertthat_0.2.0
[31] colorspace_1.3-2 labeling_0.3 KernSmooth_2.23-15
[34] stringi_1.1.6 lazyeval_0.2.1 munsell_0.4.3
This R Markdown site was created with workflowr