library(ggplot2) library(stringr) ##### #This part of scripts produces the plots of alignment data. Only plasmid-aligned reverse reads were taken here. #For every sample was created a set of reads of equal capacity (min between all sample reads). ##### #Uploading data in R t1<-read.csv("../custom_1_90_mapq.txt", sep=";") t2<-read.csv("../custom_2_87_mapq.txt", sep=";") t4<-read.csv("../custom_4_96_mapq.txt", sep=";") t5<-read.csv("../custom_5_92_mapq.txt", sep=";") t6<-read.csv("../custom_6_108_mapq.txt", sep=";") t7<-read.csv("../custom_7_97_mapq.txt", sep=";") tpol<-read.csv("../custom_pol_93_mapq.txt", sep=";") tp<-cbind(t1, "Sample" =1) tp<-rbind(tp, cbind(t2, "Sample" =2)) tp<-rbind(tp, cbind(t4, "Sample" =4)) tp<-rbind(tp, cbind(t5, "Sample" =5)) tp<-rbind(tp, cbind(t6, "Sample" =6)) tp<-rbind(tp, cbind(t7, "Sample" =7)) tp<-rbind(tp, cbind(tpol, "Sample" =9)) samples<-c(2, 4:7, 9) sample2<-subset(tp, Read==2&Sample==1) tp2<-sample2[sample(nrow(sample2), 20000),] for (i in samples) { sample2<-subset(tp, Read==2&Sample==i) tp2<-rbind(tp2, sample2[sample(nrow(sample2), 20000),]) } rm(sample2, t1, t2, t4, t5, t6, t7, tpol) #GC-content ggplot(data = tp2, aes(x = GC)) + geom_density(stat = "density", fill = "pink", alpha = 0.5) + facet_wrap(~Sample, nrow = 3) + geom_vline(xintercept = 50, linetype="dotted") + xlim(25, 75) + ylim(0, 0.15) + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/gc.png", width = 2250, height = 1500, units = "px") #Mismatches per cycle of sequence df2<-data.frame("Cycles"=0, "Sample"=1) for (i in c(1:2, 4:7, 9)) { cyc <-subset(tp2, Sample == i)[,4] cyc <- as.numeric(unlist(strsplit(cyc,","))) df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i)) } df2<-df2[-1,] df2$Sample<-as.factor(df2$Sample) rm (cyc, i) ggplot(df2, aes(x = Cycles)) + geom_density(stat = "count", fill = "blue", alpha = 0.5) + #geom_line(stat = "count", fill = "blue", alpha = 0.5) + facet_wrap(~Sample, nrow = 3) + ggtitle("Mismatches per cycle") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/mpc.png", width = 2250, height = 1500, units = "px") ggplot(df2, aes(x = Cycles)) + geom_density(stat = "density", fill = "blue", alpha = 0.5) + facet_wrap(~Sample, nrow = 3) + ggtitle("Mismatches per cycle") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/mpc_density.png", width = 2250, height = 1500, units = "px") rm (df2) #Mismatches depending on GC-content mgc2<-cbind(tp2[, c("Count_SNPs", "Sample")], "GC" = round(tp2$GC)) ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + stat_summary(fun = sum, geom="line", linewidth = 0.5) + facet_wrap(~Sample, nrow = 3) + ggtitle("Mismatches/GC") + geom_vline(xintercept = 50, linetype="dotted") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/mgc.png", width = 2250, height = 1500, units = "px") ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + stat_summary(fun = mean, geom="line", size = 0.5) + facet_wrap(~Sample, nrow = 3) + ggtitle("Mismatches/GC") + geom_vline(xintercept = 50, linetype="dotted") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/mgc_mean.png", width = 2250, height = 1500, units = "px") rm (mgc2) #Frequency of different types of mismatches mtype2 <- apply(tp2[, c(12:23)][tp2$Sample==1,], 2, sum) mtype2 <- data.frame("Sum" = mtype2) mtype2$type<-as.factor(colnames(tp2[, c(12:23)])) mtype2<-cbind(mtype2, "Sample" = 1) for (i in samples) { temp <- apply(tp2[, c(12:23)][tp2$Sample==i,], 2, sum) temp <- data.frame("Sum" = temp) temp$type<-as.factor(colnames(tp2[, c(12:23)])) temp<-cbind(temp, "Sample" = i) mtype2<-rbind(mtype2, temp) } ggplot(mtype2) + geom_line(aes(x = type, y = Sum, group = Sample)) + geom_point(aes(x = type, y = Sum)) + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + facet_wrap(~Sample, nrow = 3, ) + ggtitle("Mismatch type") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_read2/mt.png", width = 2250, height = 1500, units = "px") mtype2_2 <- apply(tp2[tp2$Sample==1, c(12:23)], 2, mean) mtype2_2 <- data.frame("mean" = mtype2_2) mtype2_2$type<-as.factor(colnames(tp2[, c(12:23)])) mtype2_2<-cbind(mtype2_2, "Sample" = 1) for (i in samples) { temp <- apply(tp2[tp2$Sample==i, c(12:23)], 2, mean) temp <- data.frame("mean" = temp) temp$type<-as.factor(colnames(tp2[, c(12:23)])) temp<-cbind(temp, "Sample" = i) mtype2_2<-rbind(mtype2_2, temp) } ggplot(mtype2_2) + geom_line(aes(x = type, y = mean, group = Sample)) + geom_point(aes(x = type, y = mean)) + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + facet_wrap(~Sample, nrow = 3) + ggtitle("Mismatch type") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_read2/mt_mean.png", width = 2250, height = 1500, units = "px") rm(mtype2, mtype2_2) #Number of deletions per sample delets2 <- tp2[tp2$Sample==1,][,10] delets2 <- unlist(strsplit(delets2,"")) delets2<- delets2[delets2 %in% c(letters, LETTERS)] delets2<-cbind("Nucleotide"=delets2, "Sample" = 1) delets2<-data.frame(delets2) delets2$Nucleotide <- as.factor(delets2$Nucleotide) for (i in samples) { temp <- tp2[tp2$Sample==i,][,10] temp <- unlist(strsplit(temp,"")) temp<- temp[temp %in% c(letters, LETTERS)] temp<-cbind("Nucleotide"=temp, "Sample" = i) temp<-data.frame(temp) temp$Nucleotide <- as.factor(temp$Nucleotide) delets2<-rbind(temp, delets2) } ggplot(delets2) + geom_bar(aes(x = Nucleotide), width = 0.25, fill = "orange") + facet_wrap(~Sample, nrow = 3, scales = "free_y") + #ylim(0, 1500) + ggtitle ("Deletes") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/delets.png", width = 2250, height = 1500, units = "px") rm (delets2) #Number of insertions per sample inserts2 <- tp2[tp2$Sample==1,][,11] inserts2 <- unlist(strsplit(inserts2,"")) inserts2<- inserts2[inserts2 %in% c(letters, LETTERS)] inserts2<-cbind("Nucleotide"=inserts2, "Sample" = 1) inserts2<-data.frame(inserts2) inserts2$Nucleotide <- as.factor(inserts2$Nucleotide) for (i in samples) { temp <- tp2[tp2$Sample==i,][,11] temp <- unlist(strsplit(temp,"")) temp<- temp[temp %in% c(letters, LETTERS)] temp<-cbind("Nucleotide"=temp, "Sample" = i) temp<-data.frame(temp) temp$Nucleotide <- as.factor(temp$Nucleotide) inserts2<-rbind(temp, inserts2) } ggplot(inserts2) + ggtitle("Inserts") + geom_bar(aes(x = Nucleotide), width = 0.25, fill = "orange") + facet_wrap(~Sample, nrow = 2, scales = "free_y") + #ylim(0, 2000) + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/inserts.png", width = 2250, height = 1500, units = "px") rm (inserts2) #Mean, min and max values of insertions per sample insert_mean2<-subset(tp2, Sample == 1)[,7] insert_mean2<-cbind("Sample"=1, t(summary(insert_mean2))) for (i in samples) { temp<-subset(tp2, Mean_length_insert!=0 & Sample == i)[,7] temp<-cbind("Sample"=i, t(summary(temp))) insert_mean2<-rbind(insert_mean2, temp) } ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + geom_pointrange(aes(ymin = 0, ymax = Max.)) + scale_y_continuous(breaks = c(0:12), limits = c(0, 13)) + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + ggtitle("Insert mean length") + scale_x_discrete(name = "Sample") + scale_y_continuous(name = "Length", breaks = seq(0, 18, 1)) + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/iml.png", width = 2250, height = 1500, units = "px") rm (insert_mean2) #Mean, min and max values of deletions per sample delete_mean2<-subset(tp2, Sample == 1)[,6] delete_mean2<-cbind("Sample"=1, t(summary(delete_mean2))) for (i in samples) { temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,6] temp<-cbind("Sample"=i, t(summary(temp))) delete_mean2<-rbind(delete_mean2, temp) } ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + geom_pointrange(aes(ymin = 0, ymax = Max.)) + scale_y_continuous(breaks = c(0:22), limits = c(0, 22)) + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + ggtitle("Delete mean length") + scale_x_discrete(name = "Sample") + scale_y_continuous(name = "Length", breaks = seq(0, 12, 1)) + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/dml.png", width = 2250, height = 1500, units = "px") rm(delete_mean2) #Number of mismatches per sample ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) + stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + ggtitle("Mismatches per sample") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/mps_mean.png", width = 2250, height = 1500, units = "px") ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) + stat_summary(fun = sum, geom="point") + stat_summary(fun = sum, geom="line") + ggtitle("Mismatches per sample") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/mps.png", width = 2250, height = 1500, units = "px") #Number of deletions per cycle of sequence numbers_only <- function(x) !grepl("\\D", x) dels2 <- tp2[tp2$Sample==1,][,10] dels2 <- unlist(strsplit(dels2, split="|",fixed = TRUE)) dels2 <- unlist(strsplit(dels2,",")) dels2<- dels2[numbers_only(dels2)] dels2<-cbind("Num"=dels2, "Sample" = 1) dels2<-data.frame(dels2) for (i in samples) { temp <- tp2[tp2$Sample==i,][,10] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[numbers_only(temp)] temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) dels2<-rbind(temp, dels2) } dels2$Num<-as.numeric(dels2$Num) ggplot(dels2, aes(x = Num)) + #geom_density(stat = "count", alpha = 0.5) + stat_count(geom="line", position="identity") + stat_count(geom="point", position="identity", size = 1) + facet_wrap(~Sample, nrow = 3, scales = "free_y") + #ylim(0, 1500) + ggtitle ("Deletes per cycle") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/dpc.png", width = 2250, height = 1500, units = "px") rm(dels2) #Number of insertions per cycle of sequence ins2 <- tp2[tp2$Sample==1,][,11] ins2 <- unlist(strsplit(ins2, split="|",fixed = TRUE)) ins2 <- unlist(strsplit(ins2,",")) ins2<- ins2[numbers_only(ins2)] ins2<-cbind("Num"=ins2, "Sample" = 1) ins2<-data.frame(ins2) for (i in samples) { temp <- tp2[tp2$Sample==i,][,11] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[numbers_only(temp)] temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) ins2<-rbind(temp, ins2) } ins2$Num<-as.numeric(ins2$Num) ggplot(ins2, aes(x = Num)) + stat_count(geom="line", position="identity") + stat_count(geom="point", position="identity", size = 1) + facet_wrap(~Sample, nrow = 3, scales = "free_y") + #ylim(0, 1500) + ggtitle ("Inserts per cycle") + scale_x_continuous(breaks=seq(0, 150, 25))+ scale_y_continuous(drop=FALSE) theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/ipc.png", width = 2250, height = 1500, units = "px") rm (ins2) #Number of deletions per sample parse_delets2 <- tp2[tp2$Sample==1,][,10] parse_delets2 <- unlist(strsplit(parse_delets2, split="|",fixed = TRUE)) parse_delets2<-cbind("Num"=parse_delets2, "Sample" = 1) parse_delets2<-data.frame(parse_delets2) for (i in samples) { temp <- tp2[tp2$Sample==i,][,10] temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) parse_delets2<-rbind(temp, parse_delets2) } ggplot(parse_delets2, aes(x=Sample, group = 1)) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + ggtitle("Deletes per sample") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/dps.png", width = 2250, height = 1500, units = "px") rm (parse_delets2) #Number of insertions per sample parse_inserts2 <- tp2[tp2$Sample==1,][,11] parse_inserts2 <- unlist(strsplit(parse_inserts2, split="|",fixed = TRUE)) parse_inserts2<-cbind("Num"=parse_inserts2, "Sample" = 1) parse_inserts2<-data.frame(parse_inserts2) for (i in samples) { temp <- tp2[tp2$Sample==i,][,11] temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) parse_inserts2<-rbind(temp, parse_inserts2) } ggplot(parse_inserts2, aes(x=Sample, group = 1)) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + ggtitle("Inserts per sample") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_read2/ips.png", width = 2250, height = 1500, units = "px") rm (parse_inserts2, temp) #Frequency of insertion motifs per sample insert_motifs2 <- tp2[tp2$Sample==1,][,11] insert_motifs2 <- unlist(strsplit(insert_motifs2, split="|",fixed = TRUE)) insert_motifs2 <- unlist(strsplit(insert_motifs2,",")) insert_motifs2<- insert_motifs2[!numbers_only(insert_motifs2)] insert_motifs2<-cbind("Num"=insert_motifs2, "Sample" = 1) insert_motifs2<-data.frame(insert_motifs2) for (i in samples) { temp <- tp2[tp2$Sample==i,][,11] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[!numbers_only(temp)] temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) insert_motifs2<-rbind(temp, insert_motifs2) } ggplot(insert_motifs2, aes(x = Num)) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + facet_wrap(~Sample, nrow=4, scales = "free_y") + ggtitle("Insert motifs per sample") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_read2/imps.png", width = 2250, height = 2500, units = "px") rm (insert_motifs2, temp) #Frequency of deletion motifs per sample delete_motifs2 <- tp2[tp2$Sample==1,][,10] delete_motifs2 <- unlist(strsplit(delete_motifs2, split="|",fixed = TRUE)) delete_motifs2 <- unlist(strsplit(delete_motifs2,",")) delete_motifs2<- delete_motifs2[!numbers_only(delete_motifs2)] delete_motifs2<-cbind("Num"=delete_motifs2, "Sample" = 1) delete_motifs2<-data.frame(delete_motifs2) for (i in samples) { temp <- tp2[tp2$Sample==i,][,10] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[!numbers_only(temp)] temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) delete_motifs2<-rbind(temp, delete_motifs2) } ggplot(delete_motifs2, aes(x = Num)) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + facet_wrap(~Sample, nrow=4, scales = "free_y") + ggtitle("Deletes motifs per sample") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_read2/dmps.png", width = 2250, height = 2500, units = "px") rm (delete_motifs2, temp)