diff --git a/bacterial_genome_zvl.R b/bacterial_genome_zvl.R new file mode 100644 index 0000000..00dd2e0 --- /dev/null +++ b/bacterial_genome_zvl.R @@ -0,0 +1,389 @@ +library(ggplot2) +library(stringr) +##### +#This part of scripts produces the plots of alignment data in a directory "../report/images_assembly_zvl/" with format ".png". Only genome-aligned reverse reads were taken here. +#For every sample was created a set of reads of equal capacity (min between all sample reads, estimated manually). +##### + +#Uploading data in R +table_set <- data.frame(matrix(ncol = 24, nrow = 0)) +colnames(table_set) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";")) + +temp_name<-read.table("names_of_custom_chromosome.txt") +samples <- c() +for (i in temp_name[,c("V1")]) { + samples<-c(samples, gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i))) +} + +for (i in samples) { + filename<-paste("custom_", i, "_genome_mapq20_table.txt", sep='') + temp<-read.csv(filename, sep=";", header=TRUE) + temp<-subset(temp, Read==2) + temp<-temp[sample(nrow(temp), 200000),] + temp<-cbind(temp, "Sample"=i) + table_set<-rbind(table_set, temp) +} +rm(temp) +table_set$Sample<-as.factor(table_set$Sample) +dir.create("../report/images_assembly_zvl") + +#GC-content + +ggplot(data = table_set, aes(x = GC)) + + geom_density(stat = "density", fill = "pink", alpha = 0.5) + + facet_wrap(~Sample, nrow = 3) + + geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/gc.png", width = 2250, height = 1500, units = "px") + +#Mismatches per cycle of sequence + +df2<-data.frame("Cycles"=0, "Sample"=1) +for (i in samples) { + cyc <-subset(table_set, Sample == i)[,4] + cyc <- as.numeric(unlist(strsplit(cyc,","))) + df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i)) +} +df2<-df2[-1,] +df2$Sample<-as.factor(df2$Sample) +df2$Cycles<-as.numeric(df2$Cycles) +rm (cyc, i) + +ggplot(df2, aes(x = Cycles)) + + geom_density(stat = "count", fill = "blue", alpha = 0.5) + + #geom_line(stat = "count", fill = "blue", alpha = 0.5) + + facet_wrap(~Sample, nrow = 3) + + ggtitle("Mismatches per cycle") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/mpc.png", width = 2250, height = 1500, units = "px") +ggplot(df2, aes(x = Cycles)) + + geom_density(stat = "density", fill = "blue", alpha = 0.5) + + facet_wrap(~Sample, nrow = 3) + + ggtitle("Mismatches per cycle") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/mpc_density.png", width = 2250, height = 1500, units = "px") +rm (df2) + +#Mismatches depending on GC-content + +mgc2<-cbind(table_set[, c("Count_SNPs", "Sample")], "GC" = round(table_set$GC)) + +ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + + stat_summary(fun = sum, geom="line", linewidth = 0.5) + + facet_wrap(~Sample, nrow = 3) + + ggtitle("Mismatches/GC") + + geom_vline(xintercept = 50, linetype="dotted") + + theme_bw()+ + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/mgc.png", width = 2250, height = 1500, units = "px") +ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + + stat_summary(fun = mean, geom="line", size = 0.5) + + facet_wrap(~Sample, nrow = 3) + + ggtitle("Mismatches/GC") + + geom_vline(xintercept = 50, linetype="dotted") + + theme_bw()+ + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/mgc_mean.png", width = 2250, height = 1500, units = "px") + +rm (mgc2) + +#Frequency of different types of mismatches + +mtype2 <- data.frame(matrix(ncol = 3, nrow = 0)) +colnames(mtype2)<- c("Sum", "type", "Sample") + +for (i in samples) { + temp <- apply(table_set[, c(12:23)][table_set$Sample==i,], 2, sum) + temp <- data.frame("Sum" = temp) + temp$type<-as.factor(colnames(table_set[, c(12:23)])) + temp<-cbind(temp, "Sample" = i) + mtype2<-rbind(mtype2, temp) +} +rm(temp) + +ggplot(mtype2) + + geom_line(aes(x = type, y = Sum, group = Sample)) + + geom_point(aes(x = type, y = Sum)) + + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + + facet_wrap(~Sample, nrow = 3, ) + + ggtitle("Mismatch type") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_assembly_zvl/mt.png", width = 2250, height = 1500, units = "px") + +mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0)) +colnames(mtype2_2)<- c("mean", "type", "Sample") +for (i in samples) { + temp <- apply(table_set[table_set$Sample==i, c(12:23)], 2, mean) + temp <- data.frame("mean" = temp) + temp$type<-as.factor(colnames(table_set[, c(12:23)])) + temp<-cbind(temp, "Sample" = i) + mtype2_2<-rbind(mtype2_2, temp) +} +mtype2_2$Sample<-as.factor(mtype2_2$Sample) + +ggplot(mtype2_2) + + geom_line(aes(x = type, y = mean, group = Sample)) + + geom_point(aes(x = type, y = mean)) + + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + + facet_wrap(~Sample, nrow = 3) + + ggtitle("Mismatch type") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_assembly_zvl/mt_mean.png", width = 2250, height = 1500, units = "px") + +rm (mtype2, mtype2_2) + +#Number of deletions per sample + +delets2 <- data.frame(matrix(ncol = 2, nrow = 0)) +colnames(delets2)<- c("Nucleotide", "Sample") +for (i in samples) { + temp <- table_set[table_set$Sample==i,][,10] + temp <- unlist(strsplit(temp,"")) + temp<- temp[temp %in% c(letters, LETTERS)] + temp<-cbind("Nucleotide"=temp, "Sample" = i) + temp<-data.frame(temp) + temp$Nucleotide <- as.factor(temp$Nucleotide) + delets2<-rbind(temp, delets2) +} +delets2$Sample<-as.factor(delets2$Sample) +rm (temp) + +ggplot(delets2) + + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + + facet_wrap(~Sample, nrow = 3) + + #ylim(0, 1500) + + ggtitle ("Deletions") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/delets.png", width = 2250, height = 1500, units = "px") + +rm (delets2) + +#Number of insertions per sample + +inserts2 <- data.frame(matrix(ncol = 2, nrow = 0)) +colnames(inserts2)<- c("Nucleotide", "Sample") +for (i in samples) { + temp <- table_set[table_set$Sample==i,][,11] + temp <- unlist(strsplit(temp,"")) + temp<- temp[temp %in% c(letters, LETTERS)] + temp<-cbind("Nucleotide"=temp, "Sample" = i) + temp<-data.frame(temp) + temp$Nucleotide <- as.factor(temp$Nucleotide) + inserts2<-rbind(temp, inserts2) +} +inserts2$Sample<-as.factor(inserts2$Sample) + +ggplot(inserts2) + + ggtitle("Insertions") + + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + + facet_wrap(~Sample, nrow = 2) + + #ylim(0, 2000) + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/inserts.png", width = 2250, height = 1500, units = "px") + +rm (inserts2) + +#Mean, min and max values of insertions per sample + +insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) +colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") +for (i in samples) { + temp<-subset(table_set, Mean_length_insert!=0 & Sample == i)[,7] + temp<-cbind("Sample"=i, t(summary(temp))) + insert_mean2<-rbind(insert_mean2, temp) +} +insert_mean2[, 2]<-as.numeric(insert_mean2[, 2]) +insert_mean2[, 4]<-as.numeric(insert_mean2[, 4]) +insert_mean2[, 5]<-as.numeric(insert_mean2[, 5]) +insert_mean2[, 7]<-as.numeric(insert_mean2[, 7]) +ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + + geom_pointrange(aes(ymin = 0, ymax = Max.)) + + #scale_y_continuous(breaks = c(0:12), limits = c(0, 13)) + + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + + ggtitle("Insertion mean length") + + scale_x_discrete(name = "Sample") + + #scale_y_continuous(name = "Length", breaks = seq(0, 18, 1)) + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/iml.png", width = 2250, height = 1500, units = "px") + +rm (insert_mean2) + +#Mean, min and max values of deletions per sample + +delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) +colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") +for (i in samples) { + temp<-subset(table_set, Mean_length_del!=0 & Sample == i)[,6] + temp<-cbind("Sample"=i, t(summary(temp))) + delete_mean2<-rbind(delete_mean2, temp) +} +delete_mean2[, 2]<-as.numeric(delete_mean2[, 2]) +delete_mean2[, 4]<-as.numeric(delete_mean2[, 4]) +delete_mean2[, 5]<-as.numeric(delete_mean2[, 5]) +delete_mean2[, 7]<-as.numeric(delete_mean2[, 7]) +ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + + geom_pointrange(aes(ymin = 0, ymax = Max.)) + + #scale_y_continuous(breaks = c(0:22), limits = c(0, 22)) + + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + + ggtitle("Deletion mean length") + + scale_x_discrete(name = "Sample") + + #scale_y_continuous(name = "Length", breaks = seq(0, 12, 1)) + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/dml.png", width = 2250, height = 1500, units = "px") + +rm (delete_mean2) + +#Number of mismatches per sample + +ggplot(table_set, aes(x=Sample, y=Count_SNPs, group = 1)) + + stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + ggtitle("Mismatches per sample") + + #geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/mps_mean.png", width = 2250, height = 1500, units = "px") + + +ggplot(table_set, aes(x=Sample, y=Count_SNPs, group = 1)) + + stat_summary(fun = mean, geom="point") + + stat_summary(fun = mean, geom="line") + + ggtitle("Mismatches per sample") + + #geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/mps.png", width = 2250, height = 1500, units = "px") + +#Number of deletions per cycle of sequence + +numbers_only <- function(x) !grepl("\\D", x) + +dels2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(dels2)<-c("Num", "Sample") +for (i in samples) { + temp <- table_set[table_set$Sample==i,][,10] + temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) + temp <- unlist(strsplit(temp,",")) + temp<- temp[numbers_only(temp)] + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + dels2<-rbind(temp, dels2) +} +dels2$Num<-as.numeric(dels2$Num) +ggplot(dels2, aes(x = Num)) + + #geom_density(stat = "count", alpha = 0.5) + + stat_count(geom="line", position="identity") + + stat_count(geom="point", position="identity", size = 1) + + facet_wrap(~Sample, nrow = 3) + + #ylim(0, 1500) + + ggtitle ("Deletions per cycle") + + theme_bw()+ + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/dpc.png", width = 2250, height = 1500, units = "px") + +rm (dels2, temp) + +#Number of insertions per cycle of sequence + +ins2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(ins2)<-c("Num", "Sample") +for (i in samples) { + temp <- table_set[table_set$Sample==i,][,11] + temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) + temp <- unlist(strsplit(temp,",")) + temp<- temp[numbers_only(temp)] + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + ins2<-rbind(temp, ins2) +} +ins2$Num<-as.numeric(ins2$Num) +ggplot(ins2, aes(x = Num)) + + stat_count(geom="line", position="identity") + + stat_count(geom="point", position="identity", size = 1) + + facet_wrap(~Sample, nrow = 3) + + #ylim(0, 1500) + + ggtitle ("Insertions per cycle") + + scale_x_continuous(breaks=seq(0, 150, 25))+ + theme_bw()+ + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/ipc.png", width = 2250, height = 1500, units = "px") +rm (ins2, temp) + +#Number of deletions per sample + +parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(parse_delets2)<-c("Num", "Sample") +for (i in samples) { + temp <- table_set[table_set$Sample==i,][,10] + temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + parse_delets2<-rbind(temp, parse_delets2) +} +ggplot(parse_delets2, aes(x=Sample, group = 1)) + + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + ggtitle("Deletions per sample") + + #geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/dps.png", width = 2250, height = 1500, units = "px") +rm (parse_delets2, temp) + +#Number of insertions per sample + +parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(parse_inserts2)<-c("Num", "Sample") +for (i in samples) { + temp <- table_set[table_set$Sample==i,][,11] + temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + parse_inserts2<-rbind(temp, parse_inserts2) +} +ggplot(parse_inserts2, aes(x=Sample, group = 1)) + + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + ggtitle("Insertions per sample") + + #geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_assembly_zvl/ips.png", width = 2250, height = 1500, units = "px") +rm (parse_inserts2, temp) + +#Frequency of insertion motifs per sample + +insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(insert_motifs2)<-c("Num", "Sample") +for (i in samples) { + temp <- table_set[table_set$Sample==i,][,11] + temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) + temp <- unlist(strsplit(temp,",")) + temp <- temp[!numbers_only(temp)] + temp <- cbind("Num"=temp, "Sample" = i) + temp <- data.frame(temp) + insert_motifs2 <- rbind(temp, insert_motifs2) +} +ggplot(insert_motifs2, aes(x = Num)) + + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + + facet_wrap(~Sample, nrow=4, scales = "free_y") + + ggtitle("Insertion motifs per sample") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_assembly_zvl/imps.png", width = 8250, height = 2500, units = "px") +rm (temp, insert_motifs2) + +#Frequency of deletion motifs per sample + +delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(delete_motifs2)<-c("Num", "Sample") +for (i in samples) { + temp <- table_set[table_set$Sample==i,][,10] + temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) + temp <- unlist(strsplit(temp,",")) + temp <- temp[!numbers_only(temp)] + temp <- cbind("Num"=temp, "Sample" = i) + temp <- data.frame(temp) + delete_motifs2 <- rbind(temp, delete_motifs2) +} +ggplot(delete_motifs2, aes(x = Num)) + + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + + facet_wrap(~Sample, nrow=7, scales = "free_y") + + ggtitle("Deletions motifs per sample") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_assembly_zvl/dmps.png", width = 8250, height = 3500, units = "px") +rm (delete_motifs2, temp) \ No newline at end of file diff --git a/plasmid.R b/plasmid.R index e8b68e2..b2fa63a 100644 --- a/plasmid.R +++ b/plasmid.R @@ -7,67 +7,60 @@ library(stringr) #Uploading data in R -t1<-read.csv("../custom_1_90_mapq.txt", sep=";") -t2<-read.csv("../custom_2_87_mapq.txt", sep=";") -t4<-read.csv("../custom_4_96_mapq.txt", sep=";") -t5<-read.csv("../custom_5_92_mapq.txt", sep=";") -t6<-read.csv("../custom_6_108_mapq.txt", sep=";") -t7<-read.csv("../custom_7_97_mapq.txt", sep=";") -tpol<-read.csv("../custom_pol_93_mapq.txt", sep=";") - -tp<-cbind(t1, "Sample" =1) -tp<-rbind(tp, cbind(t2, "Sample" =2)) -tp<-rbind(tp, cbind(t4, "Sample" =4)) -tp<-rbind(tp, cbind(t5, "Sample" =5)) -tp<-rbind(tp, cbind(t6, "Sample" =6)) -tp<-rbind(tp, cbind(t7, "Sample" =7)) -tp<-rbind(tp, cbind(tpol, "Sample" =9)) - -samples<-c(2, 4:7, 9) - -sample2<-subset(tp, Read==2&Sample==1) -tp2<-sample2[sample(nrow(sample2), 20000),] -for (i in samples) { - sample2<-subset(tp, Read==2&Sample==i) - tp2<-rbind(tp2, sample2[sample(nrow(sample2), 20000),]) +temp_name<-read.table("names_of_custom_plasmid.txt") +samples <- c() +for (i in temp_name[,c("V1")]) { + samples<-c(samples, gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i))) } -rm(sample2, t1, t2, t4, t5, t6, t7, tpol) +tp2 <- data.frame(matrix(ncol = 24, nrow = 0)) +colnames(tp2) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";")) +for (i in temp_name[,c("V1")]) { + temp<-read.csv(i, sep=";", header=TRUE) + temp<-subset(temp, Read==2) + temp<-temp[sample(nrow(temp), 200000),] + temp<-cbind(temp, "Sample"=gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i))) + tp2<-rbind(tp2, temp) +} +rm(temp, temp_name) +tp2$Sample<-as.factor(tp2$Sample) +dir.create("../report/images_new_plasmid") #GC-content ggplot(data = tp2, aes(x = GC)) + geom_density(stat = "density", fill = "pink", alpha = 0.5) + - facet_wrap(~Sample, nrow = 3) + + facet_wrap(~Sample, nrow = 2) + geom_vline(xintercept = 50, linetype="dotted") + xlim(25, 75) + ylim(0, 0.15) + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/gc.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/gc.png", width = 2250, height = 1500, units = "px") #Mismatches per cycle of sequence -df2<-data.frame("Cycles"=0, "Sample"=1) -for (i in c(1:2, 4:7, 9)) { +df2<-data.frame(matrix(nrow = 0, ncol = 2)) +colnames(df2)<-c("Cycles", "Sample") +for (i in samples) { cyc <-subset(tp2, Sample == i)[,4] cyc <- as.numeric(unlist(strsplit(cyc,","))) df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i)) } -df2<-df2[-1,] df2$Sample<-as.factor(df2$Sample) +df2$Cycles<-as.numeric(df2$Cycles) rm (cyc, i) ggplot(df2, aes(x = Cycles)) + geom_density(stat = "count", fill = "blue", alpha = 0.5) + #geom_line(stat = "count", fill = "blue", alpha = 0.5) + - facet_wrap(~Sample, nrow = 3) + + facet_wrap(~Sample, nrow = 2) + ggtitle("Mismatches per cycle") + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/mpc.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/mpc.png", width = 2250, height = 1500, units = "px") ggplot(df2, aes(x = Cycles)) + geom_density(stat = "density", fill = "blue", alpha = 0.5) + - facet_wrap(~Sample, nrow = 3) + + facet_wrap(~Sample, nrow = 2) + ggtitle("Mismatches per cycle") + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/mpc_density.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/mpc_density.png", width = 2250, height = 1500, units = "px") rm (df2) #Mismatches depending on GC-content @@ -75,28 +68,26 @@ rm (df2) mgc2<-cbind(tp2[, c("Count_SNPs", "Sample")], "GC" = round(tp2$GC)) ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + stat_summary(fun = sum, geom="line", linewidth = 0.5) + - facet_wrap(~Sample, nrow = 3) + + facet_wrap(~Sample, nrow = 2) + ggtitle("Mismatches/GC") + geom_vline(xintercept = 50, linetype="dotted") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/mgc.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/mgc.png", width = 2250, height = 1500, units = "px") ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + stat_summary(fun = mean, geom="line", size = 0.5) + - facet_wrap(~Sample, nrow = 3) + + facet_wrap(~Sample, nrow = 2) + ggtitle("Mismatches/GC") + geom_vline(xintercept = 50, linetype="dotted") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/mgc_mean.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/mgc_mean.png", width = 2250, height = 1500, units = "px") rm (mgc2) #Frequency of different types of mismatches -mtype2 <- apply(tp2[, c(12:23)][tp2$Sample==1,], 2, sum) -mtype2 <- data.frame("Sum" = mtype2) -mtype2$type<-as.factor(colnames(tp2[, c(12:23)])) -mtype2<-cbind(mtype2, "Sample" = 1) +mtype2 <- data.frame(matrix(ncol = 3, nrow = 0)) +colnames(mtype2)<- c("Sum", "type", "Sample") for (i in samples) { temp <- apply(tp2[, c(12:23)][tp2$Sample==i,], 2, sum) temp <- data.frame("Sum" = temp) @@ -107,17 +98,14 @@ for (i in samples) { ggplot(mtype2) + geom_line(aes(x = type, y = Sum, group = Sample)) + geom_point(aes(x = type, y = Sum)) + - #facet_wrap(~Sample, nrow = 3, scales = "free_y") + - facet_wrap(~Sample, nrow = 3, ) + + #facet_wrap(~Sample, nrow = 2, scales = "free_y") + + facet_wrap(~Sample, nrow = 2, ) + ggtitle("Mismatch type") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) -ggsave("../report/images_read2/mt.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/mt.png", width = 2250, height = 1500, units = "px") - -mtype2_2 <- apply(tp2[tp2$Sample==1, c(12:23)], 2, mean) -mtype2_2 <- data.frame("mean" = mtype2_2) -mtype2_2$type<-as.factor(colnames(tp2[, c(12:23)])) -mtype2_2<-cbind(mtype2_2, "Sample" = 1) +mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0)) +colnames(mtype2_2)<- c("mean", "type", "Sample") for (i in samples) { temp <- apply(tp2[tp2$Sample==i, c(12:23)], 2, mean) temp <- data.frame("mean" = temp) @@ -128,21 +116,17 @@ for (i in samples) { ggplot(mtype2_2) + geom_line(aes(x = type, y = mean, group = Sample)) + geom_point(aes(x = type, y = mean)) + - #facet_wrap(~Sample, nrow = 3, scales = "free_y") + - facet_wrap(~Sample, nrow = 3) + + #facet_wrap(~Sample, nrow = 2, scales = "free_y") + + facet_wrap(~Sample, nrow = 2) + ggtitle("Mismatch type") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) -ggsave("../report/images_read2/mt_mean.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/mt_mean.png", width = 2250, height = 1500, units = "px") rm(mtype2, mtype2_2) #Number of deletions per sample -delets2 <- tp2[tp2$Sample==1,][,10] -delets2 <- unlist(strsplit(delets2,"")) -delets2<- delets2[delets2 %in% c(letters, LETTERS)] -delets2<-cbind("Nucleotide"=delets2, "Sample" = 1) -delets2<-data.frame(delets2) -delets2$Nucleotide <- as.factor(delets2$Nucleotide) +delets2 <- data.frame(matrix(ncol = 2, nrow = 0)) +colnames(delets2)<- c("Nucleotide", "Sample") for (i in samples) { temp <- tp2[tp2$Sample==i,][,10] temp <- unlist(strsplit(temp,"")) @@ -153,22 +137,18 @@ for (i in samples) { delets2<-rbind(temp, delets2) } ggplot(delets2) + - geom_bar(aes(x = Nucleotide), width = 0.25, fill = "orange") + - facet_wrap(~Sample, nrow = 3, scales = "free_y") + + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + + facet_wrap(~Sample, nrow = 2) + #ylim(0, 1500) + - ggtitle ("Deletes") + + ggtitle ("Deletions") + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/delets.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/delets.png", width = 2250, height = 1500, units = "px") rm (delets2) #Number of insertions per sample -inserts2 <- tp2[tp2$Sample==1,][,11] -inserts2 <- unlist(strsplit(inserts2,"")) -inserts2<- inserts2[inserts2 %in% c(letters, LETTERS)] -inserts2<-cbind("Nucleotide"=inserts2, "Sample" = 1) -inserts2<-data.frame(inserts2) -inserts2$Nucleotide <- as.factor(inserts2$Nucleotide) +inserts2 <- data.frame(matrix(ncol = 2, nrow = 0)) +colnames(inserts2)<- c("Nucleotide", "Sample") for (i in samples) { temp <- tp2[tp2$Sample==i,][,11] temp <- unlist(strsplit(temp,"")) @@ -179,84 +159,86 @@ for (i in samples) { inserts2<-rbind(temp, inserts2) } ggplot(inserts2) + - ggtitle("Inserts") + - geom_bar(aes(x = Nucleotide), width = 0.25, fill = "orange") + - facet_wrap(~Sample, nrow = 2, scales = "free_y") + + ggtitle("Insertions") + + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + + facet_wrap(~Sample, nrow = 2) + #ylim(0, 2000) + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/inserts.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/inserts.png", width = 2250, height = 1500, units = "px") rm (inserts2) #Mean, min and max values of insertions per sample -insert_mean2<-subset(tp2, Sample == 1)[,7] -insert_mean2<-cbind("Sample"=1, t(summary(insert_mean2))) +insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) +colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") for (i in samples) { - temp<-subset(tp2, Mean_length_insert!=0 & Sample == i)[,7] + temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,7] temp<-cbind("Sample"=i, t(summary(temp))) insert_mean2<-rbind(insert_mean2, temp) } +insert_mean2[, 2]<-as.numeric(insert_mean2[, 2]) +insert_mean2[, 4]<-as.numeric(insert_mean2[, 4]) +insert_mean2[, 5]<-as.numeric(insert_mean2[, 5]) +insert_mean2[, 7]<-as.numeric(insert_mean2[, 7]) ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + geom_pointrange(aes(ymin = 0, ymax = Max.)) + - scale_y_continuous(breaks = c(0:12), limits = c(0, 13)) + + scale_y_continuous(name = "Length") + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + - ggtitle("Insert mean length") + + ggtitle("Insertion mean length") + scale_x_discrete(name = "Sample") + - scale_y_continuous(name = "Length", breaks = seq(0, 18, 1)) + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/iml.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/iml.png", width = 2250, height = 1500, units = "px") rm (insert_mean2) #Mean, min and max values of deletions per sample -delete_mean2<-subset(tp2, Sample == 1)[,6] -delete_mean2<-cbind("Sample"=1, t(summary(delete_mean2))) +delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) +colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") for (i in samples) { temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,6] temp<-cbind("Sample"=i, t(summary(temp))) delete_mean2<-rbind(delete_mean2, temp) } +delete_mean2[, 2]<-as.numeric(delete_mean2[, 2]) +delete_mean2[, 4]<-as.numeric(delete_mean2[, 4]) +delete_mean2[, 5]<-as.numeric(delete_mean2[, 5]) +delete_mean2[, 7]<-as.numeric(delete_mean2[, 7]) ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + geom_pointrange(aes(ymin = 0, ymax = Max.)) + - scale_y_continuous(breaks = c(0:22), limits = c(0, 22)) + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + - ggtitle("Delete mean length") + + ggtitle("Deletion mean length") + scale_x_discrete(name = "Sample") + - scale_y_continuous(name = "Length", breaks = seq(0, 12, 1)) + + scale_y_continuous(name = "Length") + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/dml.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/dml.png", width = 2250, height = 1500, units = "px") rm(delete_mean2) #Number of mismatches per sample ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) + - stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + ggtitle("Mismatches per sample") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/mps_mean.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/mps_mean.png", width = 2250, height = 1500, units = "px") ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) + - stat_summary(fun = sum, geom="point") + - stat_summary(fun = sum, geom="line") + + stat_summary(fun = mean, geom="point") + + stat_summary(fun = mean, geom="line") + ggtitle("Mismatches per sample") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/mps.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/mps.png", width = 2250, height = 1500, units = "px") #Number of deletions per cycle of sequence numbers_only <- function(x) !grepl("\\D", x) -dels2 <- tp2[tp2$Sample==1,][,10] -dels2 <- unlist(strsplit(dels2, split="|",fixed = TRUE)) -dels2 <- unlist(strsplit(dels2,",")) -dels2<- dels2[numbers_only(dels2)] -dels2<-cbind("Num"=dels2, "Sample" = 1) -dels2<-data.frame(dels2) +dels2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(dels2)<-c("Num", "Sample") for (i in samples) { temp <- tp2[tp2$Sample==i,][,10] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) @@ -271,22 +253,18 @@ ggplot(dels2, aes(x = Num)) + #geom_density(stat = "count", alpha = 0.5) + stat_count(geom="line", position="identity") + stat_count(geom="point", position="identity", size = 1) + - facet_wrap(~Sample, nrow = 3, scales = "free_y") + + facet_wrap(~Sample, nrow = 2) + #ylim(0, 1500) + - ggtitle ("Deletes per cycle") + + ggtitle ("Deletions per cycle") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/dpc.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/dpc.png", width = 2250, height = 1500, units = "px") rm(dels2) #Number of insertions per cycle of sequence -ins2 <- tp2[tp2$Sample==1,][,11] -ins2 <- unlist(strsplit(ins2, split="|",fixed = TRUE)) -ins2 <- unlist(strsplit(ins2,",")) -ins2<- ins2[numbers_only(ins2)] -ins2<-cbind("Num"=ins2, "Sample" = 1) -ins2<-data.frame(ins2) +ins2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(ins2)<-c("Num", "Sample") for (i in samples) { temp <- tp2[tp2$Sample==i,][,11] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) @@ -300,22 +278,19 @@ ins2$Num<-as.numeric(ins2$Num) ggplot(ins2, aes(x = Num)) + stat_count(geom="line", position="identity") + stat_count(geom="point", position="identity", size = 1) + - facet_wrap(~Sample, nrow = 3, scales = "free_y") + + facet_wrap(~Sample, nrow = 2, scales = "free_y") + #ylim(0, 1500) + - ggtitle ("Inserts per cycle") + + ggtitle ("Insertions per cycle") + scale_x_continuous(breaks=seq(0, 150, 25))+ - scale_y_continuous(drop=FALSE) theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/ipc.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/ipc.png", width = 2250, height = 1500, units = "px") rm (ins2) #Number of deletions per sample -parse_delets2 <- tp2[tp2$Sample==1,][,10] -parse_delets2 <- unlist(strsplit(parse_delets2, split="|",fixed = TRUE)) -parse_delets2<-cbind("Num"=parse_delets2, "Sample" = 1) -parse_delets2<-data.frame(parse_delets2) +parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(parse_delets2)<-c("Num", "Sample") for (i in samples) { temp <- tp2[tp2$Sample==i,][,10] temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) @@ -325,18 +300,16 @@ for (i in samples) { } ggplot(parse_delets2, aes(x=Sample, group = 1)) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + - ggtitle("Deletes per sample") + + ggtitle("Deletions per sample") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/dps.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/dps.png", width = 2250, height = 1500, units = "px") rm (parse_delets2) #Number of insertions per sample -parse_inserts2 <- tp2[tp2$Sample==1,][,11] -parse_inserts2 <- unlist(strsplit(parse_inserts2, split="|",fixed = TRUE)) -parse_inserts2<-cbind("Num"=parse_inserts2, "Sample" = 1) -parse_inserts2<-data.frame(parse_inserts2) +parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(parse_inserts2)<-c("Num", "Sample") for (i in samples) { temp <- tp2[tp2$Sample==i,][,11] temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) @@ -346,20 +319,16 @@ for (i in samples) { } ggplot(parse_inserts2, aes(x=Sample, group = 1)) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + - ggtitle("Inserts per sample") + + ggtitle("Insertions per sample") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5)) -ggsave("../report/images_read2/ips.png", width = 2250, height = 1500, units = "px") +ggsave("../report/images_new_plasmid/ips.png", width = 2250, height = 1500, units = "px") rm (parse_inserts2, temp) #Frequency of insertion motifs per sample -insert_motifs2 <- tp2[tp2$Sample==1,][,11] -insert_motifs2 <- unlist(strsplit(insert_motifs2, split="|",fixed = TRUE)) -insert_motifs2 <- unlist(strsplit(insert_motifs2,",")) -insert_motifs2<- insert_motifs2[!numbers_only(insert_motifs2)] -insert_motifs2<-cbind("Num"=insert_motifs2, "Sample" = 1) -insert_motifs2<-data.frame(insert_motifs2) +insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(insert_motifs2)<-c("Num", "Sample") for (i in samples) { temp <- tp2[tp2$Sample==i,][,11] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) @@ -372,19 +341,15 @@ for (i in samples) { ggplot(insert_motifs2, aes(x = Num)) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + facet_wrap(~Sample, nrow=4, scales = "free_y") + - ggtitle("Insert motifs per sample") + + ggtitle("Insertion motifs per sample") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) -ggsave("../report/images_read2/imps.png", width = 2250, height = 2500, units = "px") +ggsave("../report/images_new_plasmid/imps.png", width = 3250, height = 2500, units = "px") rm (insert_motifs2, temp) #Frequency of deletion motifs per sample -delete_motifs2 <- tp2[tp2$Sample==1,][,10] -delete_motifs2 <- unlist(strsplit(delete_motifs2, split="|",fixed = TRUE)) -delete_motifs2 <- unlist(strsplit(delete_motifs2,",")) -delete_motifs2<- delete_motifs2[!numbers_only(delete_motifs2)] -delete_motifs2<-cbind("Num"=delete_motifs2, "Sample" = 1) -delete_motifs2<-data.frame(delete_motifs2) +delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(delete_motifs2)<-c("Num", "Sample") for (i in samples) { temp <- tp2[tp2$Sample==i,][,10] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) @@ -397,7 +362,7 @@ for (i in samples) { ggplot(delete_motifs2, aes(x = Num)) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + facet_wrap(~Sample, nrow=4, scales = "free_y") + - ggtitle("Deletes motifs per sample") + + ggtitle("Deletions motifs per sample") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) -ggsave("../report/images_read2/dmps.png", width = 2250, height = 2500, units = "px") +ggsave("../report/images_new_plasmid/dmps.png", width = 4550, height = 2500, units = "px") rm (delete_motifs2, temp) \ No newline at end of file